Changed MatchFactory into a simple getmatches() function, and added a separate getmatches_by_contents() function for contents scans, which makes those scans faster and less memory-hungry (see the usage sketch below).

--HG--
extra : convert_revision : svn:c306627e-7827-47d3-bdf0-9a457c9553a1/trunk@192
hsoft 2009-10-18 08:46:00 +00:00
parent 3f34dab881
commit 7228adf433
8 changed files with 123 additions and 179 deletions
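For reviewers, a minimal before/after sketch of the API change (the files list and the option values here are made up for illustration; engine is the module modified in this diff):

    # Before this commit: options were attributes on a factory object.
    mf = engine.MatchFactory()
    mf.min_match_percentage = 50
    mf.weight_words = True
    matches = mf.getmatches(files)

    # After this commit: a plain function taking keyword arguments...
    matches = engine.getmatches(files, min_match_percentage=50, weight_words=True)
    # ...and a dedicated entry point for contents scans, which compares
    # file sizes first, then md5partial, then full md5.
    matches = engine.getmatches_by_contents(files, sizeattr='size', partial=False)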

View File

@@ -79,7 +79,7 @@ def GetDisplayInfo(dupe, group, delta):
         format_timestamp(ctime, delta and m),
         format_timestamp(mtime, delta and m),
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]

View File

@@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
         str(dupe.track),
         dupe.comment,
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]

View File

@@ -9,6 +9,7 @@
 from __future__ import division
 import difflib
+import itertools
 import logging
 import string
 from collections import defaultdict, namedtuple
@@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
     percentage = compare(first.words, second.words, flags)
     return Match(first, second, percentage)

-class MatchFactory(object):
-    common_word_threshold = 50
-    match_similar_words = False
-    min_match_percentage = 0
-    weight_words = False
-    no_field_order = False
-    limit = 5000000
-
-    def getmatches(self, objects, j=job.nulljob):
-        j = j.start_subjob(2)
-        sj = j.start_subjob(2)
-        for o in objects:
-            if not hasattr(o, 'words'):
-                o.words = getwords(o.name)
-        word_dict = build_word_dict(objects, sj)
-        reduce_common_words(word_dict, self.common_word_threshold)
-        if self.match_similar_words:
-            merge_similar_words(word_dict)
-        match_flags = []
-        if self.weight_words:
-            match_flags.append(WEIGHT_WORDS)
-        if self.match_similar_words:
-            match_flags.append(MATCH_SIMILAR_WORDS)
-        if self.no_field_order:
-            match_flags.append(NO_FIELD_ORDER)
-        j.start_job(len(word_dict), '0 matches found')
-        compared = defaultdict(set)
-        result = []
-        try:
-            # This whole 'popping' thing is there to avoid taking too much memory at the same time.
-            while word_dict:
-                items = word_dict.popitem()[1]
-                while items:
-                    ref = items.pop()
-                    compared_already = compared[ref]
-                    to_compare = items - compared_already
-                    compared_already |= to_compare
-                    for other in to_compare:
-                        m = get_match(ref, other, match_flags)
-                        if m.percentage >= self.min_match_percentage:
-                            result.append(m)
-                            if len(result) >= self.limit:
-                                return result
-                j.add_progress(desc='%d matches found' % len(result))
-        except MemoryError:
-            # This is the place where the memory usage is at its peak during the scan.
-            # Just continue the process with an incomplete list of matches.
-            del compared # This should give us enough room to call logging.
-            logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
-        return result
+def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
+        no_field_order=False, j=job.nulljob):
+    COMMON_WORD_THRESHOLD = 50
+    LIMIT = 5000000
+    j = j.start_subjob(2)
+    sj = j.start_subjob(2)
+    for o in objects:
+        if not hasattr(o, 'words'):
+            o.words = getwords(o.name)
+    word_dict = build_word_dict(objects, sj)
+    reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
+    if match_similar_words:
+        merge_similar_words(word_dict)
+    match_flags = []
+    if weight_words:
+        match_flags.append(WEIGHT_WORDS)
+    if match_similar_words:
+        match_flags.append(MATCH_SIMILAR_WORDS)
+    if no_field_order:
+        match_flags.append(NO_FIELD_ORDER)
+    j.start_job(len(word_dict), '0 matches found')
+    compared = defaultdict(set)
+    result = []
+    try:
+        # This whole 'popping' thing is there to avoid taking too much memory at the same time.
+        while word_dict:
+            items = word_dict.popitem()[1]
+            while items:
+                ref = items.pop()
+                compared_already = compared[ref]
+                to_compare = items - compared_already
+                compared_already |= to_compare
+                for other in to_compare:
+                    m = get_match(ref, other, match_flags)
+                    if m.percentage >= min_match_percentage:
+                        result.append(m)
+                        if len(result) >= LIMIT:
+                            return result
+            j.add_progress(desc='%d matches found' % len(result))
+    except MemoryError:
+        # This is the place where the memory usage is at its peak during the scan.
+        # Just continue the process with an incomplete list of matches.
+        del compared # This should give us enough room to call logging.
+        logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
+        return result
+    return result
+
+def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
+    j = j.start_subjob([2, 8])
+    size2files = defaultdict(set)
+    for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
+        size2files[getattr(file, sizeattr)].add(file)
+    possible_matches = [files for files in size2files.values() if len(files) > 1]
+    del size2files
+    result = []
+    j.start_job(len(possible_matches), '0 matches found')
+    for group in possible_matches:
+        for first, second in itertools.combinations(group, 2):
+            if first.md5partial == second.md5partial:
+                if partial or first.md5 == second.md5:
+                    result.append(Match(first, second, 100))
+        j.add_progress(desc='%d matches found' % len(result))
+    return result

 class Group(object):
     #---Override
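A small self-contained sketch of what the new getmatches_by_contents() does with the size/md5partial/md5 cascade above (FakeFile is a hypothetical stand-in for dupeGuru file objects, which expose size, md5partial and md5):

    class FakeFile(object):
        # Hypothetical stand-in: only the attributes the function reads.
        def __init__(self, size, md5partial, md5):
            self.size = size
            self.md5partial = md5partial
            self.md5 = md5

    a = FakeFile(1024, 'p1', 'h1')
    b = FakeFile(1024, 'p1', 'h1')  # same size and hashes as a --> match at 100
    c = FakeFile(1024, 'p1', 'h2')  # same partial hash, different full md5 --> no match
    matches = getmatches_by_contents([a, b, c])
    assert len(matches) == 1 and matches[0].percentage == 100

With partial=True (used for the audio contents scan), only md5partial is compared, so c would match a and b as well. Files with a unique size are never hashed at all, which is where the speed and memory win comes from.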

View File

@@ -32,40 +32,32 @@ class Scanner(object):
         self.ignore_list = IgnoreList()
         self.discarded_file_count = 0

-    @staticmethod
-    def _filter_matches_by_content(matches, partial, j):
-        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        md5attrname = 'md5partial' if partial else 'md5'
-        md5 = lambda f: getattr(f, md5attrname)
-        for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
-            md5(matched_file)
-        j.set_progress(100, 'Removing false matches')
-        return [m for m in matches if md5(m.first) == md5(m.second)]
-
     def _getmatches(self, files, j):
-        j = j.start_subjob(2)
-        mf = engine.MatchFactory()
-        if self.scan_type != SCAN_TYPE_CONTENT:
-            mf.match_similar_words = self.match_similar_words
-            mf.weight_words = self.word_weighting
-            mf.min_match_percentage = self.min_match_percentage
-            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
-                self.scan_type = SCAN_TYPE_FIELDS
-                mf.no_field_order = True
-        func = {
-            SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
-            SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
-            SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
-            SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
-            SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
-        }[self.scan_type]
-        for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
-            if self.size_threshold:
+        if self.size_threshold:
+            j = j.start_subjob([2, 8])
+            for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
                 f.size # pre-read, makes a smoother progress if read here (especially for bundles)
-            f.words = func(f)
-        if self.size_threshold:
             files = [f for f in files if f.size >= self.size_threshold]
-        return mf.getmatches(files, j)
+        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
+            sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
+            return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
+        else:
+            j = j.start_subjob([2, 8])
+            kw = {}
+            kw['match_similar_words'] = self.match_similar_words
+            kw['weight_words'] = self.word_weighting
+            kw['min_match_percentage'] = self.min_match_percentage
+            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
+                self.scan_type = SCAN_TYPE_FIELDS
+                kw['no_field_order'] = True
+            func = {
+                SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
+                SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
+                SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
+            }[self.scan_type]
+            for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
+                f.words = func(f)
+            return engine.getmatches(files, j=j, **kw)

     @staticmethod
     def _key_func(dupe):
@@ -86,10 +78,7 @@ class Scanner(object):
         for f in [f for f in files if not hasattr(f, 'is_ref')]:
             f.is_ref = False
         logging.info('Getting matches')
-        if self.match_factory is None:
-            matches = self._getmatches(files, j)
-        else:
-            matches = self.match_factory.getmatches(files, j)
+        matches = self._getmatches(files, j)
         logging.info('Found %d matches' % len(matches))
         if not self.mix_file_kind:
             j.set_progress(100, 'Removing false matches')
@@ -99,14 +88,6 @@ class Scanner(object):
             iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
             matches = [m for m in iter_matches
                 if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
-        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
-            j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
-            matches = self._filter_matches_by_content(matches, partial=True, j=j)
-            if self.scan_type == SCAN_TYPE_CONTENT:
-                matches = self._filter_matches_by_content(matches, partial=False, j=j)
-            # We compared md5. No words were involved.
-            for m in matches:
-                m.first.words = m.second.words = ['--']
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
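In short, the rewritten _getmatches() routes by scan type. A rough, hypothetical equivalent of the dispatch (route_scan is illustrative only; the constants and scanner attributes are as named in this diff, with the job/progress plumbing left out):

    def route_scan(scanner, files):
        # Content scans bypass word matching entirely.
        if scanner.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
            sizeattr = 'size' if scanner.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
            return engine.getmatches_by_contents(files, sizeattr,
                partial=(scanner.scan_type == SCAN_TYPE_CONTENT_AUDIO))
        # Word-based scans compute f.words first, then match on words.
        return engine.getmatches(files,
            min_match_percentage=scanner.min_match_percentage,
            match_similar_words=scanner.match_similar_words,
            weight_words=scanner.word_weighting)

Because content matches are now confirmed against md5 before a Match is ever created, the old _filter_matches_by_content() post-pass (removed above) is no longer needed.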

View File

@@ -318,7 +318,7 @@ class TCDupeGuru_renameSelected(TestCase):
         fp = open(str(p + 'foo bar 3'),mode='w')
         fp.close()
         refdir = hsfs.phys.Directory(None,str(p))
-        matches = engine.MatchFactory().getmatches(refdir.files)
+        matches = engine.getmatches(refdir.files)
         groups = engine.get_groups(matches)
         g = groups[0]
         g.prioritize(lambda x:x.name)

View File

@@ -340,21 +340,13 @@ class TCget_match(TestCase):
         self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)

-class TCMatchFactory(TestCase):
+class GetMatches(TestCase):
     def test_empty(self):
-        self.assertEqual([],MatchFactory().getmatches([]))
-
-    def test_defaults(self):
-        mf = MatchFactory()
-        self.assertEqual(50,mf.common_word_threshold)
-        self.assertEqual(False,mf.weight_words)
-        self.assertEqual(False,mf.match_similar_words)
-        self.assertEqual(False,mf.no_field_order)
-        self.assertEqual(0,mf.min_match_percentage)
+        eq_(getmatches([]), [])

     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(2,len(r))
         seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
         m = seek[0]
@@ -367,7 +359,7 @@ class TCMatchFactory(TestCase):
     def test_null_and_unrelated_objects(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
         m = r[0]
         self.assertEqual(50,m.percentage)
@@ -376,34 +368,33 @@ class TCMatchFactory(TestCase):
     def test_twice_the_same_word(self):
         l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))

     def test_twice_the_same_word_when_preworded(self):
         l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))

     def test_two_words_match(self):
         l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))

     def test_match_files_with_only_common_words(self):
         #If a word occurs more than 50 times, it is excluded from the matching process
         #The problem with the common_word_threshold is that the files containing only common
         #words will never be matched together. We *should* match them.
-        mf = MatchFactory()
-        mf.common_word_threshold = 50
+        # This test assumes that the common word threshold const is 50
         l = [NamedObject("foo") for i in range(50)]
-        r = mf.getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1225,len(r))

     def test_use_words_already_there_if_there(self):
         o1 = NamedObject('foo')
         o2 = NamedObject('bar')
         o2.words = ['foo']
-        self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
+        eq_(1, len(getmatches([o1,o2])))

     def test_job(self):
         def do_progress(p,d=''):
@@ -413,75 +404,62 @@ class TCMatchFactory(TestCase):
         j = job.Job(1,do_progress)
         self.log = []
         s = "foo bar"
-        MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
+        getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
         self.assert_(len(self.log) > 2)
         self.assertEqual(0,self.log[0])
         self.assertEqual(100,self.log[-1])

     def test_weight_words(self):
-        mf = MatchFactory()
-        mf.weight_words = True
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l, weight_words=True)[0]
         self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)

     def test_similar_word(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(1,len(mf.getmatches(l)))
-        self.assertEqual(100,mf.getmatches(l)[0].percentage)
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
+        eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
         l = [NamedObject("foobar"),NamedObject("foo")]
-        self.assertEqual(0,len(mf.getmatches(l))) #too far
+        eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
         l = [NamedObject("bizkit"),NamedObject("bizket")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
         l = [NamedObject("foobar"),NamedObject("foosbar")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)

     def test_single_object_with_similar_words(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foo foos")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 0)

     def test_double_words_get_counted_only_once(self):
-        mf = MatchFactory()
         l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l)[0]
         self.assertEqual(75,m.percentage)

     def test_with_fields(self):
-        mf = MatchFactory()
         o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("foo bar - bleh bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
+        m = getmatches([o1, o2])[0]
         self.assertEqual(50, m.percentage)

     def test_with_fields_no_order(self):
-        mf = MatchFactory()
-        mf.no_field_order = True
         o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("bleh bang - foo bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
-        self.assertEqual(50 ,m.percentage)
+        m = getmatches([o1, o2], no_field_order=True)[0]
+        eq_(m.percentage, 50)

     def test_only_match_similar_when_the_option_is_set(self):
-        mf = MatchFactory()
-        mf.match_similar_words = False
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=False)), 0)

     def test_dont_recurse_do_match(self):
         # with nosetests, the stack is increased. The number has to be high enough not to be failing falsely
         sys.setrecursionlimit(100)
-        mf = MatchFactory()
         files = [NamedObject('foo bar') for i in range(101)]
         try:
-            mf.getmatches(files)
+            getmatches(files)
         except RuntimeError:
             self.fail()
         finally:
@@ -489,18 +467,9 @@ class TCMatchFactory(TestCase):
     def test_min_match_percentage(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        mf = MatchFactory()
-        mf.min_match_percentage = 50
-        r = mf.getmatches(l)
+        r = getmatches(l, min_match_percentage=50)
         self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match

-    def test_limit(self):
-        l = [NamedObject(),NamedObject(),NamedObject()]
-        mf = MatchFactory()
-        mf.limit = 2
-        r = mf.getmatches(l)
-        self.assertEqual(2,len(r))
-
     def test_MemoryError(self):
         @log_calls
         def mocked_match(first, second, flags):
@@ -510,9 +479,8 @@ class TCMatchFactory(TestCase):
         objects = [NamedObject() for i in range(10)] # results in 45 matches
         self.mock(engine, 'get_match', mocked_match)
-        mf = MatchFactory()
         try:
-            r = mf.getmatches(objects)
+            r = getmatches(objects)
         except MemoryError:
             self.fail('MemoryError must be handled')
         self.assertEqual(42, len(r))
@@ -738,7 +706,7 @@ class TCget_groups(TestCase):
     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         m = matches[0]
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -749,7 +717,7 @@ class TCget_groups(TestCase):
     def test_group_with_multiple_matches(self):
         #This results in 3 matches
         l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
         g = r[0]
@@ -759,7 +727,7 @@ class TCget_groups(TestCase):
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
         #There will be 2 groups here: group "a b" and group "c d"
         #"b c" can go in either of them, but not both.
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(2,len(r))
         self.assertEqual(5,len(r[0])+len(r[1]))
@@ -768,7 +736,7 @@ class TCget_groups(TestCase):
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
         #All four objects match each other
         #They must all end up in the same group
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -788,7 +756,7 @@ class TCget_groups(TestCase):
     def test_four_sized_group(self):
         l = [NamedObject("foobar") for i in xrange(4)]
-        m = MatchFactory().getmatches(l)
+        m = getmatches(l)
         r = get_groups(m)
         self.assertEqual(1,len(r))
         self.assertEqual(4,len(r[0]))

View File

@@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
 def GetTestGroups():
     objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
     objects[1].size = 1024
-    matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+    matches = engine.getmatches(objects) #we should have 5 matches
     groups = engine.get_groups(matches) #We should have 2 groups
     for g in groups:
         g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
@@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
             return objects[1]

         objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
-        matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+        matches = engine.getmatches(objects) #we should have 1 match
         groups = engine.get_groups(matches) #We should have 1 group
         for g in groups:
             g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is

View File

@@ -369,23 +369,6 @@ def test_ignore_list_checks_for_unicode():
     assert f2 in g
     assert f3 in g

-def test_custom_match_factory():
-    class MatchFactory(object):
-        def getmatches(self, objects, j=None):
-            return [Match(objects[0], objects[1], 420)]
-
-    s = Scanner()
-    s.match_factory = MatchFactory()
-    o1, o2 = no('foo'), no('bar')
-    groups = s.GetDupeGroups([o1, o2])
-    eq_(len(groups), 1)
-    g = groups[0]
-    eq_(len(g), 2)
-    g.switch_ref(o1)
-    m = g.get_match_of(o2)
-    eq_(m, (o1, o2, 420))
-
 def test_file_evaluates_to_false():
     # A very wrong way to use any() was added at some point, causing resulting group list
     # to be empty.