mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-10 05:34:36 +00:00
Changed the MatchFactory class into a simple getmatches() function, and added a separate getmatches_by_contents() function for content scans, which results in faster and less memory-hungry scans.
--HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40192
This commit is contained in:
parent
3f34dab881
commit
7228adf433
@ -79,7 +79,7 @@ def GetDisplayInfo(dupe, group, delta):
|
|||||||
format_timestamp(ctime, delta and m),
|
format_timestamp(ctime, delta and m),
|
||||||
format_timestamp(mtime, delta and m),
|
format_timestamp(mtime, delta and m),
|
||||||
format_perc(percentage),
|
format_perc(percentage),
|
||||||
format_words(dupe.words),
|
format_words(dupe.words) if hasattr(dupe, 'words') else '',
|
||||||
format_dupe_count(dupe_count)
|
format_dupe_count(dupe_count)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
|
|||||||
str(dupe.track),
|
str(dupe.track),
|
||||||
dupe.comment,
|
dupe.comment,
|
||||||
format_perc(percentage),
|
format_perc(percentage),
|
||||||
format_words(dupe.words),
|
format_words(dupe.words) if hasattr(dupe, 'words') else '',
|
||||||
format_dupe_count(dupe_count)
|
format_dupe_count(dupe_count)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import difflib
|
import difflib
|
||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import string
|
import string
|
||||||
from collections import defaultdict, namedtuple
|
from collections import defaultdict, namedtuple
|
||||||
@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
|
|||||||
percentage = compare(first.words, second.words, flags)
|
percentage = compare(first.words, second.words, flags)
|
||||||
return Match(first, second, percentage)
|
return Match(first, second, percentage)
|
||||||
|
|
||||||
class MatchFactory(object):
|
def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
|
||||||
common_word_threshold = 50
|
no_field_order=False, j=job.nulljob):
|
||||||
match_similar_words = False
|
COMMON_WORD_THRESHOLD = 50
|
||||||
min_match_percentage = 0
|
LIMIT = 5000000
|
||||||
weight_words = False
|
j = j.start_subjob(2)
|
||||||
no_field_order = False
|
sj = j.start_subjob(2)
|
||||||
limit = 5000000
|
for o in objects:
|
||||||
|
if not hasattr(o, 'words'):
|
||||||
def getmatches(self, objects, j=job.nulljob):
|
o.words = getwords(o.name)
|
||||||
j = j.start_subjob(2)
|
word_dict = build_word_dict(objects, sj)
|
||||||
sj = j.start_subjob(2)
|
reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
|
||||||
for o in objects:
|
if match_similar_words:
|
||||||
if not hasattr(o, 'words'):
|
merge_similar_words(word_dict)
|
||||||
o.words = getwords(o.name)
|
match_flags = []
|
||||||
word_dict = build_word_dict(objects, sj)
|
if weight_words:
|
||||||
reduce_common_words(word_dict, self.common_word_threshold)
|
match_flags.append(WEIGHT_WORDS)
|
||||||
if self.match_similar_words:
|
if match_similar_words:
|
||||||
merge_similar_words(word_dict)
|
match_flags.append(MATCH_SIMILAR_WORDS)
|
||||||
match_flags = []
|
if no_field_order:
|
||||||
if self.weight_words:
|
match_flags.append(NO_FIELD_ORDER)
|
||||||
match_flags.append(WEIGHT_WORDS)
|
j.start_job(len(word_dict), '0 matches found')
|
||||||
if self.match_similar_words:
|
compared = defaultdict(set)
|
||||||
match_flags.append(MATCH_SIMILAR_WORDS)
|
result = []
|
||||||
if self.no_field_order:
|
try:
|
||||||
match_flags.append(NO_FIELD_ORDER)
|
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
|
||||||
j.start_job(len(word_dict), '0 matches found')
|
while word_dict:
|
||||||
compared = defaultdict(set)
|
items = word_dict.popitem()[1]
|
||||||
result = []
|
while items:
|
||||||
try:
|
ref = items.pop()
|
||||||
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
|
compared_already = compared[ref]
|
||||||
while word_dict:
|
to_compare = items - compared_already
|
||||||
items = word_dict.popitem()[1]
|
compared_already |= to_compare
|
||||||
while items:
|
for other in to_compare:
|
||||||
ref = items.pop()
|
m = get_match(ref, other, match_flags)
|
||||||
compared_already = compared[ref]
|
if m.percentage >= min_match_percentage:
|
||||||
to_compare = items - compared_already
|
result.append(m)
|
||||||
compared_already |= to_compare
|
if len(result) >= LIMIT:
|
||||||
for other in to_compare:
|
return result
|
||||||
m = get_match(ref, other, match_flags)
|
j.add_progress(desc='%d matches found' % len(result))
|
||||||
if m.percentage >= self.min_match_percentage:
|
except MemoryError:
|
||||||
result.append(m)
|
# This is the place where the memory usage is at its peak during the scan.
|
||||||
if len(result) >= self.limit:
|
# Just continue the process with an incomplete list of matches.
|
||||||
return result
|
del compared # This should give us enough room to call logging.
|
||||||
j.add_progress(desc='%d matches found' % len(result))
|
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
|
||||||
except MemoryError:
|
|
||||||
# This is the place where the memory usage is at its peak during the scan.
|
|
||||||
# Just continue the process with an incomplete list of matches.
|
|
||||||
del compared # This should give us enough room to call logging.
|
|
||||||
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
|
|
||||||
return result
|
|
||||||
return result
|
return result
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
|
||||||
|
j = j.start_subjob([2, 8])
|
||||||
|
size2files = defaultdict(set)
|
||||||
|
for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
|
||||||
|
size2files[getattr(file, sizeattr)].add(file)
|
||||||
|
possible_matches = [files for files in size2files.values() if len(files) > 1]
|
||||||
|
del size2files
|
||||||
|
result = []
|
||||||
|
j.start_job(len(possible_matches), '0 matches found')
|
||||||
|
for group in possible_matches:
|
||||||
|
for first, second in itertools.combinations(group, 2):
|
||||||
|
if first.md5partial == second.md5partial:
|
||||||
|
if partial or first.md5 == second.md5:
|
||||||
|
result.append(Match(first, second, 100))
|
||||||
|
j.add_progress(desc='%d matches found' % len(result))
|
||||||
|
return result
|
||||||
|
|
||||||
class Group(object):
|
class Group(object):
|
||||||
#---Override
|
#---Override
|
||||||
|
@ -32,40 +32,32 @@ class Scanner(object):
|
|||||||
self.ignore_list = IgnoreList()
|
self.ignore_list = IgnoreList()
|
||||||
self.discarded_file_count = 0
|
self.discarded_file_count = 0
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _filter_matches_by_content(matches, partial, j):
|
|
||||||
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
|
|
||||||
md5attrname = 'md5partial' if partial else 'md5'
|
|
||||||
md5 = lambda f: getattr(f, md5attrname)
|
|
||||||
for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
|
|
||||||
md5(matched_file)
|
|
||||||
j.set_progress(100, 'Removing false matches')
|
|
||||||
return [m for m in matches if md5(m.first) == md5(m.second)]
|
|
||||||
|
|
||||||
def _getmatches(self, files, j):
|
def _getmatches(self, files, j):
|
||||||
j = j.start_subjob(2)
|
if not self.size_threshold:
|
||||||
mf = engine.MatchFactory()
|
j = j.start_subjob([2, 8])
|
||||||
if self.scan_type != SCAN_TYPE_CONTENT:
|
for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
|
||||||
mf.match_similar_words = self.match_similar_words
|
|
||||||
mf.weight_words = self.word_weighting
|
|
||||||
mf.min_match_percentage = self.min_match_percentage
|
|
||||||
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
|
|
||||||
self.scan_type = SCAN_TYPE_FIELDS
|
|
||||||
mf.no_field_order = True
|
|
||||||
func = {
|
|
||||||
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
|
|
||||||
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
|
|
||||||
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
|
|
||||||
SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
|
|
||||||
SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
|
|
||||||
}[self.scan_type]
|
|
||||||
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
|
|
||||||
if self.size_threshold:
|
|
||||||
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
|
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
|
||||||
f.words = func(f)
|
|
||||||
if self.size_threshold:
|
|
||||||
files = [f for f in files if f.size >= self.size_threshold]
|
files = [f for f in files if f.size >= self.size_threshold]
|
||||||
return mf.getmatches(files, j)
|
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
|
||||||
|
sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
|
||||||
|
return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
|
||||||
|
else:
|
||||||
|
j = j.start_subjob([2, 8])
|
||||||
|
kw = {}
|
||||||
|
kw['match_similar_words'] = self.match_similar_words
|
||||||
|
kw['weight_words'] = self.word_weighting
|
||||||
|
kw['min_match_percentage'] = self.min_match_percentage
|
||||||
|
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
|
||||||
|
self.scan_type = SCAN_TYPE_FIELDS
|
||||||
|
kw['no_field_order'] = True
|
||||||
|
func = {
|
||||||
|
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
|
||||||
|
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
|
||||||
|
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
|
||||||
|
}[self.scan_type]
|
||||||
|
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
|
||||||
|
f.words = func(f)
|
||||||
|
return engine.getmatches(files, j=j, **kw)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _key_func(dupe):
|
def _key_func(dupe):
|
||||||
@ -86,10 +78,7 @@ class Scanner(object):
|
|||||||
for f in [f for f in files if not hasattr(f, 'is_ref')]:
|
for f in [f for f in files if not hasattr(f, 'is_ref')]:
|
||||||
f.is_ref = False
|
f.is_ref = False
|
||||||
logging.info('Getting matches')
|
logging.info('Getting matches')
|
||||||
if self.match_factory is None:
|
matches = self._getmatches(files, j)
|
||||||
matches = self._getmatches(files, j)
|
|
||||||
else:
|
|
||||||
matches = self.match_factory.getmatches(files, j)
|
|
||||||
logging.info('Found %d matches' % len(matches))
|
logging.info('Found %d matches' % len(matches))
|
||||||
if not self.mix_file_kind:
|
if not self.mix_file_kind:
|
||||||
j.set_progress(100, 'Removing false matches')
|
j.set_progress(100, 'Removing false matches')
|
||||||
@ -99,14 +88,6 @@ class Scanner(object):
|
|||||||
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
|
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
|
||||||
matches = [m for m in iter_matches
|
matches = [m for m in iter_matches
|
||||||
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
|
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
|
||||||
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
|
|
||||||
j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
|
|
||||||
matches = self._filter_matches_by_content(matches, partial=True, j=j)
|
|
||||||
if self.scan_type == SCAN_TYPE_CONTENT:
|
|
||||||
matches = self._filter_matches_by_content(matches, partial=False, j=j)
|
|
||||||
# We compared md5. No words were involved.
|
|
||||||
for m in matches:
|
|
||||||
m.first.words = m.second.words = ['--']
|
|
||||||
logging.info('Grouping matches')
|
logging.info('Grouping matches')
|
||||||
groups = engine.get_groups(matches, j)
|
groups = engine.get_groups(matches, j)
|
||||||
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
|
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
|
||||||
|
@ -318,7 +318,7 @@ class TCDupeGuru_renameSelected(TestCase):
|
|||||||
fp = open(str(p + 'foo bar 3'),mode='w')
|
fp = open(str(p + 'foo bar 3'),mode='w')
|
||||||
fp.close()
|
fp.close()
|
||||||
refdir = hsfs.phys.Directory(None,str(p))
|
refdir = hsfs.phys.Directory(None,str(p))
|
||||||
matches = engine.MatchFactory().getmatches(refdir.files)
|
matches = engine.getmatches(refdir.files)
|
||||||
groups = engine.get_groups(matches)
|
groups = engine.get_groups(matches)
|
||||||
g = groups[0]
|
g = groups[0]
|
||||||
g.prioritize(lambda x:x.name)
|
g.prioritize(lambda x:x.name)
|
||||||
|
@ -340,21 +340,13 @@ class TCget_match(TestCase):
|
|||||||
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
|
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
|
||||||
|
|
||||||
|
|
||||||
class TCMatchFactory(TestCase):
|
class GetMatches(TestCase):
|
||||||
def test_empty(self):
|
def test_empty(self):
|
||||||
self.assertEqual([],MatchFactory().getmatches([]))
|
eq_(getmatches([]), [])
|
||||||
|
|
||||||
def test_defaults(self):
|
|
||||||
mf = MatchFactory()
|
|
||||||
self.assertEqual(50,mf.common_word_threshold)
|
|
||||||
self.assertEqual(False,mf.weight_words)
|
|
||||||
self.assertEqual(False,mf.match_similar_words)
|
|
||||||
self.assertEqual(False,mf.no_field_order)
|
|
||||||
self.assertEqual(0,mf.min_match_percentage)
|
|
||||||
|
|
||||||
def test_simple(self):
|
def test_simple(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(2,len(r))
|
self.assertEqual(2,len(r))
|
||||||
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
|
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
|
||||||
m = seek[0]
|
m = seek[0]
|
||||||
@ -367,7 +359,7 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
def test_null_and_unrelated_objects(self):
|
def test_null_and_unrelated_objects(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
m = r[0]
|
m = r[0]
|
||||||
self.assertEqual(50,m.percentage)
|
self.assertEqual(50,m.percentage)
|
||||||
@ -376,34 +368,33 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
def test_twice_the_same_word(self):
|
def test_twice_the_same_word(self):
|
||||||
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
|
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
def test_twice_the_same_word_when_preworded(self):
|
def test_twice_the_same_word_when_preworded(self):
|
||||||
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
|
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
def test_two_words_match(self):
|
def test_two_words_match(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
|
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
def test_match_files_with_only_common_words(self):
|
def test_match_files_with_only_common_words(self):
|
||||||
#If a word occurs more than 50 times, it is excluded from the matching process
|
#If a word occurs more than 50 times, it is excluded from the matching process
|
||||||
#The problem with the common_word_threshold is that the files containing only common
|
#The problem with the common_word_threshold is that the files containing only common
|
||||||
#words will never be matched together. We *should* match them.
|
#words will never be matched together. We *should* match them.
|
||||||
mf = MatchFactory()
|
# This test assumes that the common word threshold constant is 50
|
||||||
mf.common_word_threshold = 50
|
|
||||||
l = [NamedObject("foo") for i in range(50)]
|
l = [NamedObject("foo") for i in range(50)]
|
||||||
r = mf.getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1225,len(r))
|
self.assertEqual(1225,len(r))
|
||||||
|
|
||||||
def test_use_words_already_there_if_there(self):
|
def test_use_words_already_there_if_there(self):
|
||||||
o1 = NamedObject('foo')
|
o1 = NamedObject('foo')
|
||||||
o2 = NamedObject('bar')
|
o2 = NamedObject('bar')
|
||||||
o2.words = ['foo']
|
o2.words = ['foo']
|
||||||
self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
|
eq_(1, len(getmatches([o1,o2])))
|
||||||
|
|
||||||
def test_job(self):
|
def test_job(self):
|
||||||
def do_progress(p,d=''):
|
def do_progress(p,d=''):
|
||||||
@ -413,75 +404,62 @@ class TCMatchFactory(TestCase):
|
|||||||
j = job.Job(1,do_progress)
|
j = job.Job(1,do_progress)
|
||||||
self.log = []
|
self.log = []
|
||||||
s = "foo bar"
|
s = "foo bar"
|
||||||
MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
|
getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
|
||||||
self.assert_(len(self.log) > 2)
|
self.assert_(len(self.log) > 2)
|
||||||
self.assertEqual(0,self.log[0])
|
self.assertEqual(0,self.log[0])
|
||||||
self.assertEqual(100,self.log[-1])
|
self.assertEqual(100,self.log[-1])
|
||||||
|
|
||||||
def test_weight_words(self):
|
def test_weight_words(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.weight_words = True
|
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
||||||
m = mf.getmatches(l)[0]
|
m = getmatches(l, weight_words=True)[0]
|
||||||
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
|
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
|
||||||
|
|
||||||
def test_similar_word(self):
|
def test_similar_word(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.match_similar_words = True
|
|
||||||
l = [NamedObject("foobar"),NamedObject("foobars")]
|
l = [NamedObject("foobar"),NamedObject("foobars")]
|
||||||
self.assertEqual(1,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 1)
|
||||||
self.assertEqual(100,mf.getmatches(l)[0].percentage)
|
eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
|
||||||
l = [NamedObject("foobar"),NamedObject("foo")]
|
l = [NamedObject("foobar"),NamedObject("foo")]
|
||||||
self.assertEqual(0,len(mf.getmatches(l))) #too far
|
eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
|
||||||
l = [NamedObject("bizkit"),NamedObject("bizket")]
|
l = [NamedObject("bizkit"),NamedObject("bizket")]
|
||||||
self.assertEqual(1,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 1)
|
||||||
l = [NamedObject("foobar"),NamedObject("foosbar")]
|
l = [NamedObject("foobar"),NamedObject("foosbar")]
|
||||||
self.assertEqual(1,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 1)
|
||||||
|
|
||||||
def test_single_object_with_similar_words(self):
|
def test_single_object_with_similar_words(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.match_similar_words = True
|
|
||||||
l = [NamedObject("foo foos")]
|
l = [NamedObject("foo foos")]
|
||||||
self.assertEqual(0,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 0)
|
||||||
|
|
||||||
def test_double_words_get_counted_only_once(self):
|
def test_double_words_get_counted_only_once(self):
|
||||||
mf = MatchFactory()
|
|
||||||
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
|
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
|
||||||
m = mf.getmatches(l)[0]
|
m = getmatches(l)[0]
|
||||||
self.assertEqual(75,m.percentage)
|
self.assertEqual(75,m.percentage)
|
||||||
|
|
||||||
def test_with_fields(self):
|
def test_with_fields(self):
|
||||||
mf = MatchFactory()
|
|
||||||
o1 = NamedObject("foo bar - foo bleh")
|
o1 = NamedObject("foo bar - foo bleh")
|
||||||
o2 = NamedObject("foo bar - bleh bar")
|
o2 = NamedObject("foo bar - bleh bar")
|
||||||
o1.words = getfields(o1.name)
|
o1.words = getfields(o1.name)
|
||||||
o2.words = getfields(o2.name)
|
o2.words = getfields(o2.name)
|
||||||
m = mf.getmatches([o1, o2])[0]
|
m = getmatches([o1, o2])[0]
|
||||||
self.assertEqual(50, m.percentage)
|
self.assertEqual(50, m.percentage)
|
||||||
|
|
||||||
def test_with_fields_no_order(self):
|
def test_with_fields_no_order(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.no_field_order = True
|
|
||||||
o1 = NamedObject("foo bar - foo bleh")
|
o1 = NamedObject("foo bar - foo bleh")
|
||||||
o2 = NamedObject("bleh bang - foo bar")
|
o2 = NamedObject("bleh bang - foo bar")
|
||||||
o1.words = getfields(o1.name)
|
o1.words = getfields(o1.name)
|
||||||
o2.words = getfields(o2.name)
|
o2.words = getfields(o2.name)
|
||||||
m = mf.getmatches([o1, o2])[0]
|
m = getmatches([o1, o2], no_field_order=True)[0]
|
||||||
self.assertEqual(50 ,m.percentage)
|
eq_(m.percentage, 50)
|
||||||
|
|
||||||
def test_only_match_similar_when_the_option_is_set(self):
|
def test_only_match_similar_when_the_option_is_set(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.match_similar_words = False
|
|
||||||
l = [NamedObject("foobar"),NamedObject("foobars")]
|
l = [NamedObject("foobar"),NamedObject("foobars")]
|
||||||
self.assertEqual(0,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=False)), 0)
|
||||||
|
|
||||||
def test_dont_recurse_do_match(self):
|
def test_dont_recurse_do_match(self):
|
||||||
# with nosetests, the stack is increased. The number has to be high enough not to be failing falsely
|
# with nosetests, the stack is increased. The number has to be high enough not to be failing falsely
|
||||||
sys.setrecursionlimit(100)
|
sys.setrecursionlimit(100)
|
||||||
mf = MatchFactory()
|
|
||||||
files = [NamedObject('foo bar') for i in range(101)]
|
files = [NamedObject('foo bar') for i in range(101)]
|
||||||
try:
|
try:
|
||||||
mf.getmatches(files)
|
getmatches(files)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
self.fail()
|
self.fail()
|
||||||
finally:
|
finally:
|
||||||
@ -489,18 +467,9 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
def test_min_match_percentage(self):
|
def test_min_match_percentage(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
||||||
mf = MatchFactory()
|
r = getmatches(l, min_match_percentage=50)
|
||||||
mf.min_match_percentage = 50
|
|
||||||
r = mf.getmatches(l)
|
|
||||||
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
|
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
|
||||||
|
|
||||||
def test_limit(self):
|
|
||||||
l = [NamedObject(),NamedObject(),NamedObject()]
|
|
||||||
mf = MatchFactory()
|
|
||||||
mf.limit = 2
|
|
||||||
r = mf.getmatches(l)
|
|
||||||
self.assertEqual(2,len(r))
|
|
||||||
|
|
||||||
def test_MemoryError(self):
|
def test_MemoryError(self):
|
||||||
@log_calls
|
@log_calls
|
||||||
def mocked_match(first, second, flags):
|
def mocked_match(first, second, flags):
|
||||||
@ -510,9 +479,8 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
objects = [NamedObject() for i in range(10)] # results in 45 matches
|
objects = [NamedObject() for i in range(10)] # results in 45 matches
|
||||||
self.mock(engine, 'get_match', mocked_match)
|
self.mock(engine, 'get_match', mocked_match)
|
||||||
mf = MatchFactory()
|
|
||||||
try:
|
try:
|
||||||
r = mf.getmatches(objects)
|
r = getmatches(objects)
|
||||||
except MemoryError:
|
except MemoryError:
|
||||||
self.fail('MemorryError must be handled')
|
self.fail('MemorryError must be handled')
|
||||||
self.assertEqual(42, len(r))
|
self.assertEqual(42, len(r))
|
||||||
@ -738,7 +706,7 @@ class TCget_groups(TestCase):
|
|||||||
|
|
||||||
def test_simple(self):
|
def test_simple(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
m = matches[0]
|
m = matches[0]
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
@ -749,7 +717,7 @@ class TCget_groups(TestCase):
|
|||||||
def test_group_with_multiple_matches(self):
|
def test_group_with_multiple_matches(self):
|
||||||
#This results in 3 matches
|
#This results in 3 matches
|
||||||
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
|
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
g = r[0]
|
g = r[0]
|
||||||
@ -759,7 +727,7 @@ class TCget_groups(TestCase):
|
|||||||
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
|
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
|
||||||
#There will be 2 groups here: group "a b" and group "c d"
|
#There will be 2 groups here: group "a b" and group "c d"
|
||||||
#"b c" can go either of them, but not both.
|
#"b c" can go either of them, but not both.
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(2,len(r))
|
self.assertEqual(2,len(r))
|
||||||
self.assertEqual(5,len(r[0])+len(r[1]))
|
self.assertEqual(5,len(r[0])+len(r[1]))
|
||||||
@ -768,7 +736,7 @@ class TCget_groups(TestCase):
|
|||||||
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
|
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
|
||||||
#There will be 2 groups here: group "a b" and group "c d"
|
#There will be 2 groups here: group "a b" and group "c d"
|
||||||
#"b c" can fit in both, but it must be in only one of them
|
#"b c" can fit in both, but it must be in only one of them
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
@ -788,7 +756,7 @@ class TCget_groups(TestCase):
|
|||||||
|
|
||||||
def test_four_sized_group(self):
|
def test_four_sized_group(self):
|
||||||
l = [NamedObject("foobar") for i in xrange(4)]
|
l = [NamedObject("foobar") for i in xrange(4)]
|
||||||
m = MatchFactory().getmatches(l)
|
m = getmatches(l)
|
||||||
r = get_groups(m)
|
r = get_groups(m)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
self.assertEqual(4,len(r[0]))
|
self.assertEqual(4,len(r[0]))
|
||||||
|
@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
|
|||||||
def GetTestGroups():
|
def GetTestGroups():
|
||||||
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
|
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
|
||||||
objects[1].size = 1024
|
objects[1].size = 1024
|
||||||
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
|
matches = engine.getmatches(objects) #we should have 5 matches
|
||||||
groups = engine.get_groups(matches) #We should have 2 groups
|
groups = engine.get_groups(matches) #We should have 2 groups
|
||||||
for g in groups:
|
for g in groups:
|
||||||
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
||||||
@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
|
|||||||
return objects[1]
|
return objects[1]
|
||||||
|
|
||||||
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
|
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
|
||||||
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
|
matches = engine.getmatches(objects) #we should have 5 matches
|
||||||
groups = engine.get_groups(matches) #We should have 2 groups
|
groups = engine.get_groups(matches) #We should have 2 groups
|
||||||
for g in groups:
|
for g in groups:
|
||||||
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
||||||
|
@ -369,23 +369,6 @@ def test_ignore_list_checks_for_unicode():
|
|||||||
assert f2 in g
|
assert f2 in g
|
||||||
assert f3 in g
|
assert f3 in g
|
||||||
|
|
||||||
def test_custom_match_factory():
|
|
||||||
class MatchFactory(object):
|
|
||||||
def getmatches(self, objects, j=None):
|
|
||||||
return [Match(objects[0], objects[1], 420)]
|
|
||||||
|
|
||||||
|
|
||||||
s = Scanner()
|
|
||||||
s.match_factory = MatchFactory()
|
|
||||||
o1, o2 = no('foo'), no('bar')
|
|
||||||
groups = s.GetDupeGroups([o1, o2])
|
|
||||||
eq_(len(groups), 1)
|
|
||||||
g = groups[0]
|
|
||||||
eq_(len(g), 2)
|
|
||||||
g.switch_ref(o1)
|
|
||||||
m = g.get_match_of(o2)
|
|
||||||
eq_(m, (o1, o2, 420))
|
|
||||||
|
|
||||||
def test_file_evaluates_to_false():
|
def test_file_evaluates_to_false():
|
||||||
# A very wrong way to use any() was added at some point, causing resulting group list
|
# A very wrong way to use any() was added at some point, causing resulting group list
|
||||||
# to be empty.
|
# to be empty.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user