# Created By: Virgil Dupras
# Created On: 2006/01/29
# Copyright 2010 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "BSD" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/bsd_license

import sys

from jobprogress import job
from hsutil.decorators import log_calls
from hsutil.misc import first
from hsutil.testutil import eq_
from hsutil.testcase import TestCase

from .. import engine
from ..engine import *


# A fake file-like object used as a stand-in for real files throughout these tests.
class NamedObject(object):
    def __init__(self, name="foobar", with_words=False, size=1):
        self.name = name
        self.size = size
        self.md5partial = name
        self.md5 = name
        if with_words:
            self.words = getwords(name)


no = NamedObject


def get_match_triangle():
    o1 = NamedObject(with_words=True)
    o2 = NamedObject(with_words=True)
    o3 = NamedObject(with_words=True)
    m1 = get_match(o1, o2)
    m2 = get_match(o1, o3)
    m3 = get_match(o2, o3)
    return [m1, m2, m3]


def get_test_group():
    m1, m2, m3 = get_match_triangle()
    result = Group()
    result.add_match(m1)
    result.add_match(m2)
    result.add_match(m3)
    return result


def assert_match(m, name1, name2):
    # When testing matches, whether objects are in first or second position very often doesn't
    # matter. This function makes the test more convenient.
    if m.first.name == name1:
        eq_(m.second.name, name2)
    else:
        eq_(m.first.name, name2)
        eq_(m.second.name, name1)


class TCgetwords(TestCase):
    def test_spaces(self):
        self.assertEqual(['a', 'b', 'c', 'd'], getwords("a b c d"))
        self.assertEqual(['a', 'b', 'c', 'd'], getwords(" a b c d "))

    def test_splitter_chars(self):
        self.assertEqual(
            [chr(i) for i in range(ord('a'), ord('z') + 1)],
            getwords("a-b_c&d+e(f)g;h\\i[j]k{l}m:n.o,p=q;r/s?t~u!v@w#x$y*z")
        )

    def test_joiner_chars(self):
        self.assertEqual(["aec"], getwords("a'e\u0301c"))

    def test_empty(self):
        self.assertEqual([], getwords(''))

    def test_returns_lowercase(self):
        self.assertEqual(['foo', 'bar'], getwords('FOO BAR'))

    def test_decompose_unicode(self):
        self.assertEqual(getwords('foo\xe9bar'), ['fooebar'])


class TCgetfields(TestCase):
    def test_simple(self):
        self.assertEqual([['a', 'b'], ['c', 'd', 'e']], getfields('a b - c d e'))

    def test_empty(self):
        self.assertEqual([], getfields(''))

    def test_cleans_empty_fields(self):
        expected = [['a', 'bc', 'def']]
        actual = getfields(' - a bc def')
        self.assertEqual(expected, actual)
        expected = [['bc', 'def']]


class TCunpack_fields(TestCase):
    def test_with_fields(self):
        expected = ['a', 'b', 'c', 'd', 'e', 'f']
        actual = unpack_fields([['a'], ['b', 'c'], ['d', 'e', 'f']])
        self.assertEqual(expected, actual)

    def test_without_fields(self):
        expected = ['a', 'b', 'c', 'd', 'e', 'f']
        actual = unpack_fields(['a', 'b', 'c', 'd', 'e', 'f'])
        self.assertEqual(expected, actual)

    def test_empty(self):
        self.assertEqual([], unpack_fields([]))


class TCWordCompare(TestCase):
    def test_list(self):
        self.assertEqual(100, compare(['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']))
        self.assertEqual(86, compare(['a', 'b', 'c', 'd'], ['a', 'b', 'c']))

    def test_unordered(self):
        # Sometimes, users don't want too much fuzzy matching. When they set the slider to 100,
        # they don't expect a filename with the same words, but not in the same order, to match.
        # Thus, we want to return 99 in that case.
        self.assertEqual(99, compare(['a', 'b', 'c', 'd'], ['d', 'b', 'c', 'a']))

    def test_word_occurs_twice(self):
        # if a word occurs twice in first, but once in second, we want the word to be only counted once
        self.assertEqual(89, compare(['a', 'b', 'c', 'd', 'a'], ['d', 'b', 'c', 'a']))

    def test_uses_copy_of_lists(self):
        first = ['foo', 'bar']
        second = ['bar', 'bleh']
        compare(first, second)
        self.assertEqual(['foo', 'bar'], first)
        self.assertEqual(['bar', 'bleh'], second)

    def test_word_weight(self):
        self.assertEqual(int((6.0 / 13.0) * 100), compare(['foo', 'bar'], ['bar', 'bleh'], (WEIGHT_WORDS,)))

    def test_similar_words(self):
        self.assertEqual(100, compare(['the', 'white', 'stripes'], ['the', 'whites', 'stripe'], (MATCH_SIMILAR_WORDS,)))

    def test_empty(self):
        self.assertEqual(0, compare([], []))

    def test_with_fields(self):
        self.assertEqual(67, compare([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))

    def test_propagate_flags_with_fields(self):
        def mock_compare(first, second, flags):
            self.assertEqual((0, 1, 2, 3, 5), flags)

        self.mock(engine, 'compare_fields', mock_compare)
        compare([['a']], [['a']], (0, 1, 2, 3, 5))


class TCWordCompareWithFields(TestCase):
    def test_simple(self):
        self.assertEqual(67, compare_fields([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))

    def test_empty(self):
        self.assertEqual(0, compare_fields([], []))

    def test_different_length(self):
        self.assertEqual(0, compare_fields([['a'], ['b']], [['a'], ['b'], ['c']]))

    def test_propagates_flags(self):
        def mock_compare(first, second, flags):
            self.assertEqual((0, 1, 2, 3, 5), flags)

        self.mock(engine, 'compare_fields', mock_compare)
        compare_fields([['a']], [['a']], (0, 1, 2, 3, 5))

    def test_order(self):
        first = [['a', 'b'], ['c', 'd', 'e']]
        second = [['c', 'd', 'f'], ['a', 'b']]
        self.assertEqual(0, compare_fields(first, second))

    def test_no_order(self):
        first = [['a', 'b'], ['c', 'd', 'e']]
        second = [['c', 'd', 'f'], ['a', 'b']]
        self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER,)))
        first = [['a', 'b'], ['a', 'b']]  # a field can only be matched once.
        second = [['c', 'd', 'f'], ['a', 'b']]
        self.assertEqual(0, compare_fields(first, second, (NO_FIELD_ORDER,)))
        first = [['a', 'b'], ['a', 'b', 'c']]
        second = [['c', 'd', 'f'], ['a', 'b']]
        self.assertEqual(33, compare_fields(first, second, (NO_FIELD_ORDER,)))

    def test_compare_fields_without_order_doesnt_alter_fields(self):
        # The NO_ORDER comp type altered the fields!
        first = [['a', 'b'], ['c', 'd', 'e']]
        second = [['c', 'd', 'f'], ['a', 'b']]
        self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER,)))
        self.assertEqual([['a', 'b'], ['c', 'd', 'e']], first)
        self.assertEqual([['c', 'd', 'f'], ['a', 'b']], second)


class TCbuild_word_dict(TestCase):
    def test_with_standard_words(self):
        l = [NamedObject('foo bar', True)]
        l.append(NamedObject('bar baz', True))
        l.append(NamedObject('baz bleh foo', True))
        d = build_word_dict(l)
        self.assertEqual(4, len(d))
        self.assertEqual(2, len(d['foo']))
        self.assert_(l[0] in d['foo'])
        self.assert_(l[2] in d['foo'])
        self.assertEqual(2, len(d['bar']))
        self.assert_(l[0] in d['bar'])
        self.assert_(l[1] in d['bar'])
        self.assertEqual(2, len(d['baz']))
        self.assert_(l[1] in d['baz'])
        self.assert_(l[2] in d['baz'])
        self.assertEqual(1, len(d['bleh']))
        self.assert_(l[2] in d['bleh'])

    def test_unpack_fields(self):
        o = NamedObject('')
        o.words = [['foo', 'bar'], ['baz']]
        d = build_word_dict([o])
        self.assertEqual(3, len(d))
        self.assertEqual(1, len(d['foo']))

    def test_words_are_unaltered(self):
        o = NamedObject('')
        o.words = [['foo', 'bar'], ['baz']]
        d = build_word_dict([o])
        self.assertEqual([['foo', 'bar'], ['baz']], o.words)

    def test_object_instances_can_only_be_once_in_words_object_list(self):
        o = NamedObject('foo foo', True)
        d = build_word_dict([o])
        self.assertEqual(1, len(d['foo']))

    def test_job(self):
        def do_progress(p, d=''):
            self.log.append(p)
            return True

        j = job.Job(1, do_progress)
        self.log = []
        s = "foo bar"
        build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
        # We don't have intermediate log because iter_with_progress is called with every > 1
        self.assertEqual(0, self.log[0])
        self.assertEqual(100, self.log[1])


class TCmerge_similar_words(TestCase):
    def test_some_similar_words(self):
        d = {
            'foobar': set([1]),
            'foobar1': set([2]),
            'foobar2': set([3]),
        }
        merge_similar_words(d)
        self.assertEqual(1, len(d))
        self.assertEqual(3, len(d['foobar']))


class TCreduce_common_words(TestCase):
    def test_typical(self):
        d = {
            'foo': set([NamedObject('foo bar', True) for i in range(50)]),
            'bar': set([NamedObject('foo bar', True) for i in range(49)])
        }
        reduce_common_words(d, 50)
        self.assert_('foo' not in d)
        self.assertEqual(49, len(d['bar']))

    def test_dont_remove_objects_with_only_common_words(self):
        d = {
            'common': set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
            'uncommon': set([NamedObject("common uncommon", True)])
        }
        reduce_common_words(d, 50)
        self.assertEqual(1, len(d['common']))
        self.assertEqual(1, len(d['uncommon']))

    def test_values_still_are_set_instances(self):
        d = {
            'common': set([NamedObject("common uncommon", True) for i in range(50)] + [NamedObject("common", True)]),
            'uncommon': set([NamedObject("common uncommon", True)])
        }
        reduce_common_words(d, 50)
        self.assert_(isinstance(d['common'], set))
        self.assert_(isinstance(d['uncommon'], set))

    def test_dont_raise_KeyError_when_a_word_has_been_removed(self):
        # If a word has been removed by the reduce, an object in a subsequent common word that
        # contains the word that has been removed would cause a KeyError.
        d = {
            'foo': set([NamedObject('foo bar baz', True) for i in range(50)]),
            'bar': set([NamedObject('foo bar baz', True) for i in range(50)]),
            'baz': set([NamedObject('foo bar baz', True) for i in range(49)])
        }
        try:
            reduce_common_words(d, 50)
        except KeyError:
            self.fail()

    def test_unpack_fields(self):
        # object.words may be fields.
        def create_it():
            o = NamedObject('')
            o.words = [['foo', 'bar'], ['baz']]
            return o

        d = {
            'foo': set([create_it() for i in range(50)])
        }
        try:
            reduce_common_words(d, 50)
        except TypeError:
            self.fail("must support fields.")

    def test_consider_a_reduced_common_word_common_even_after_reduction(self):
        # There was a bug in the code that caused a word that had already been reduced not to
        # be counted as a common word for subsequent words. For example, if 'foo' is processed
        # as a common word, keeping a "foo bar" file in it, and then 'bar' is processed, "foo bar"
        # would not stay in 'bar' because 'foo' is not a common word anymore.
        only_common = NamedObject('foo bar', True)
        d = {
            'foo': set([NamedObject('foo bar baz', True) for i in range(49)] + [only_common]),
            'bar': set([NamedObject('foo bar baz', True) for i in range(49)] + [only_common]),
            'baz': set([NamedObject('foo bar baz', True) for i in range(49)])
        }
        reduce_common_words(d, 50)
        self.assertEqual(1, len(d['foo']))
        self.assertEqual(1, len(d['bar']))
        self.assertEqual(49, len(d['baz']))


class TCget_match(TestCase):
    def test_simple(self):
        o1 = NamedObject("foo bar", True)
        o2 = NamedObject("bar bleh", True)
        m = get_match(o1, o2)
        self.assertEqual(50, m.percentage)
        self.assertEqual(['foo', 'bar'], m.first.words)
        self.assertEqual(['bar', 'bleh'], m.second.words)
        self.assert_(m.first is o1)
        self.assert_(m.second is o2)

    def test_in(self):
        o1 = NamedObject("foo", True)
        o2 = NamedObject("bar", True)
        m = get_match(o1, o2)
        self.assert_(o1 in m)
        self.assert_(o2 in m)
        self.assert_(object() not in m)

    def test_word_weight(self):
        self.assertEqual(int((6.0 / 13.0) * 100), get_match(NamedObject("foo bar", True), NamedObject("bar bleh", True), (WEIGHT_WORDS,)).percentage)


class GetMatches(TestCase):
    def test_empty(self):
        eq_(getmatches([]), [])

    def test_simple(self):
        l = [NamedObject("foo bar"), NamedObject("bar bleh"), NamedObject("a b c foo")]
        r = getmatches(l)
        self.assertEqual(2, len(r))
        m = first(m for m in r if m.percentage == 50)  # "foo bar" and "bar bleh"
        assert_match(m, 'foo bar', 'bar bleh')
        m = first(m for m in r if m.percentage == 33)  # "foo bar" and "a b c foo"
        assert_match(m, 'foo bar', 'a b c foo')

    def test_null_and_unrelated_objects(self):
        l = [NamedObject("foo bar"), NamedObject("bar bleh"), NamedObject(""), NamedObject("unrelated object")]
        r = getmatches(l)
        eq_(len(r), 1)
        m = r[0]
        eq_(m.percentage, 50)
        assert_match(m, 'foo bar', 'bar bleh')

    def test_twice_the_same_word(self):
        l = [NamedObject("foo foo bar"), NamedObject("bar bleh")]
        r = getmatches(l)
        self.assertEqual(1, len(r))

    def test_twice_the_same_word_when_preworded(self):
        l = [NamedObject("foo foo bar", True), NamedObject("bar bleh", True)]
        r = getmatches(l)
        self.assertEqual(1, len(r))

    def test_two_words_match(self):
        l = [NamedObject("foo bar"), NamedObject("foo bar bleh")]
        r = getmatches(l)
        self.assertEqual(1, len(r))

    def test_match_files_with_only_common_words(self):
        # If a word occurs more than 50 times, it is excluded from the matching process.
        # The problem with the common_word_threshold is that the files containing only common
        # words will never be matched together. We *should* match them.
        # This test assumes that the common word threshold const is 50
        l = [NamedObject("foo") for i in range(50)]
        r = getmatches(l)
        self.assertEqual(1225, len(r))

    def test_use_words_already_there_if_there(self):
        o1 = NamedObject('foo')
        o2 = NamedObject('bar')
        o2.words = ['foo']
        eq_(1, len(getmatches([o1, o2])))

    def test_job(self):
        def do_progress(p, d=''):
            self.log.append(p)
            return True

        j = job.Job(1, do_progress)
        self.log = []
        s = "foo bar"
        getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
        self.assert_(len(self.log) > 2)
        self.assertEqual(0, self.log[0])
        self.assertEqual(100, self.log[-1])

    def test_weight_words(self):
        l = [NamedObject("foo bar"), NamedObject("bar bleh")]
        m = getmatches(l, weight_words=True)[0]
        self.assertEqual(int((6.0 / 13.0) * 100), m.percentage)

    def test_similar_word(self):
        l = [NamedObject("foobar"), NamedObject("foobars")]
        eq_(len(getmatches(l, match_similar_words=True)), 1)
        eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
        l = [NamedObject("foobar"), NamedObject("foo")]
        eq_(len(getmatches(l, match_similar_words=True)), 0)  # too far
        l = [NamedObject("bizkit"), NamedObject("bizket")]
        eq_(len(getmatches(l, match_similar_words=True)), 1)
        l = [NamedObject("foobar"), NamedObject("foosbar")]
        eq_(len(getmatches(l, match_similar_words=True)), 1)

    def test_single_object_with_similar_words(self):
        l = [NamedObject("foo foos")]
        eq_(len(getmatches(l, match_similar_words=True)), 0)

    def test_double_words_get_counted_only_once(self):
        l = [NamedObject("foo bar foo bleh"), NamedObject("foo bar bleh bar")]
        m = getmatches(l)[0]
        self.assertEqual(75, m.percentage)

    def test_with_fields(self):
        o1 = NamedObject("foo bar - foo bleh")
        o2 = NamedObject("foo bar - bleh bar")
        o1.words = getfields(o1.name)
        o2.words = getfields(o2.name)
        m = getmatches([o1, o2])[0]
        self.assertEqual(50, m.percentage)

    def test_with_fields_no_order(self):
        o1 = NamedObject("foo bar - foo bleh")
        o2 = NamedObject("bleh bang - foo bar")
        o1.words = getfields(o1.name)
        o2.words = getfields(o2.name)
        m = getmatches([o1, o2], no_field_order=True)[0]
        eq_(m.percentage, 50)

    def test_only_match_similar_when_the_option_is_set(self):
        l = [NamedObject("foobar"), NamedObject("foobars")]
        eq_(len(getmatches(l, match_similar_words=False)), 0)

    def test_dont_recurse_do_match(self):
        # With nosetests, the stack is increased. The number has to be high enough not to fail falsely.
        sys.setrecursionlimit(100)
        files = [NamedObject('foo bar') for i in range(101)]
        try:
            getmatches(files)
        except RuntimeError:
            self.fail()
        finally:
            sys.setrecursionlimit(1000)

    def test_min_match_percentage(self):
        l = [NamedObject("foo bar"), NamedObject("bar bleh"), NamedObject("a b c foo")]
        r = getmatches(l, min_match_percentage=50)
        self.assertEqual(1, len(r))  # Only "foo bar" / "bar bleh" should match

    def test_MemoryError(self):
        @log_calls
        def mocked_match(first, second, flags):
            if len(mocked_match.calls) > 42:
                raise MemoryError()
            return Match(first, second, 0)

        objects = [NamedObject() for i in range(10)]  # results in 45 matches
        self.mock(engine, 'get_match', mocked_match)
        try:
            r = getmatches(objects)
        except MemoryError:
            self.fail('MemoryError must be handled')
        self.assertEqual(42, len(r))


class GetMatchesByContents(TestCase):
    def test_dont_compare_empty_files(self):
        o1, o2 = no(size=0), no(size=0)
        assert not getmatches_by_contents([o1, o2])


class TCGroup(TestCase):
    def test_empty(self):
        g = Group()
        self.assertEqual(None, g.ref)
        self.assertEqual([], g.dupes)
        self.assertEqual(0, len(g.matches))

    def test_add_match(self):
        g = Group()
        m = get_match(NamedObject("foo", True), NamedObject("bar", True))
        g.add_match(m)
        self.assert_(g.ref is m.first)
        self.assertEqual([m.second], g.dupes)
        self.assertEqual(1, len(g.matches))
        self.assert_(m in g.matches)

    def test_multiple_add_match(self):
        g = Group()
        o1 = NamedObject("a", True)
        o2 = NamedObject("b", True)
        o3 = NamedObject("c", True)
        o4 = NamedObject("d", True)
        g.add_match(get_match(o1, o2))
        self.assert_(g.ref is o1)
        self.assertEqual([o2], g.dupes)
        self.assertEqual(1, len(g.matches))
        g.add_match(get_match(o1, o3))
        self.assertEqual([o2], g.dupes)
        self.assertEqual(2, len(g.matches))
        g.add_match(get_match(o2, o3))
        self.assertEqual([o2, o3], g.dupes)
        self.assertEqual(3, len(g.matches))
        g.add_match(get_match(o1, o4))
        self.assertEqual([o2, o3], g.dupes)
        self.assertEqual(4, len(g.matches))
        g.add_match(get_match(o2, o4))
        self.assertEqual([o2, o3], g.dupes)
        self.assertEqual(5, len(g.matches))
        g.add_match(get_match(o3, o4))
        self.assertEqual([o2, o3, o4], g.dupes)
        self.assertEqual(6, len(g.matches))

    def test_len(self):
        g = Group()
        self.assertEqual(0, len(g))
        g.add_match(get_match(NamedObject("foo", True), NamedObject("bar", True)))
        self.assertEqual(2, len(g))

    def test_add_same_match_twice(self):
        g = Group()
        m = get_match(NamedObject("foo", True), NamedObject("foo", True))
        g.add_match(m)
        self.assertEqual(2, len(g))
        self.assertEqual(1, len(g.matches))
        g.add_match(m)
        self.assertEqual(2, len(g))
        self.assertEqual(1, len(g.matches))

    def test_in(self):
        g = Group()
        o1 = NamedObject("foo", True)
        o2 = NamedObject("bar", True)
        self.assert_(o1 not in g)
        g.add_match(get_match(o1, o2))
        self.assert_(o1 in g)
        self.assert_(o2 in g)

    def test_remove(self):
        g = Group()
        o1 = NamedObject("foo", True)
        o2 = NamedObject("bar", True)
        o3 = NamedObject("bleh", True)
        g.add_match(get_match(o1, o2))
        g.add_match(get_match(o1, o3))
        g.add_match(get_match(o2, o3))
        self.assertEqual(3, len(g.matches))
        self.assertEqual(3, len(g))
        g.remove_dupe(o3)
        self.assertEqual(1, len(g.matches))
        self.assertEqual(2, len(g))
        g.remove_dupe(o1)
        self.assertEqual(0, len(g.matches))
        self.assertEqual(0, len(g))

    def test_remove_with_ref_dupes(self):
        g = Group()
        o1 = NamedObject("foo", True)
        o2 = NamedObject("bar", True)
        o3 = NamedObject("bleh", True)
        g.add_match(get_match(o1, o2))
        g.add_match(get_match(o1, o3))
        g.add_match(get_match(o2, o3))
        o1.is_ref = True
        o2.is_ref = True
        g.remove_dupe(o3)
        self.assertEqual(0, len(g))
    def test_switch_ref(self):
        o1 = NamedObject(with_words=True)
        o2 = NamedObject(with_words=True)
        g = Group()
        g.add_match(get_match(o1, o2))
        self.assert_(o1 is g.ref)
        g.switch_ref(o2)
        self.assert_(o2 is g.ref)
        self.assertEqual([o1], g.dupes)
        g.switch_ref(o2)
        self.assert_(o2 is g.ref)
        g.switch_ref(NamedObject('', True))
        self.assert_(o2 is g.ref)

    def test_get_match_of(self):
        g = Group()
        for m in get_match_triangle():
            g.add_match(m)
        o = g.dupes[0]
        m = g.get_match_of(o)
        self.assert_(g.ref in m)
        self.assert_(o in m)
        self.assert_(g.get_match_of(NamedObject('', True)) is None)
        self.assert_(g.get_match_of(g.ref) is None)

    def test_percentage(self):
        # percentage should return the avg percentage in relation to the ref
        m1, m2, m3 = get_match_triangle()
        m1 = Match(m1[0], m1[1], 100)
        m2 = Match(m2[0], m2[1], 50)
        m3 = Match(m3[0], m3[1], 33)
        g = Group()
        g.add_match(m1)
        g.add_match(m2)
        g.add_match(m3)
        self.assertEqual(75, g.percentage)
        g.switch_ref(g.dupes[0])
        self.assertEqual(66, g.percentage)
        g.remove_dupe(g.dupes[0])
        self.assertEqual(33, g.percentage)
        g.add_match(m1)
        g.add_match(m2)
        self.assertEqual(66, g.percentage)

    def test_percentage_on_empty_group(self):
        g = Group()
        self.assertEqual(0, g.percentage)

    def test_prioritize(self):
        m1, m2, m3 = get_match_triangle()
        o1 = m1.first
        o2 = m1.second
        o3 = m2.second
        o1.name = 'c'
        o2.name = 'b'
        o3.name = 'a'
        g = Group()
        g.add_match(m1)
        g.add_match(m2)
        g.add_match(m3)
        self.assert_(o1 is g.ref)
        g.prioritize(lambda x: x.name)
        self.assert_(o3 is g.ref)

    def test_prioritize_with_tie_breaker(self):
        # if the ref has the same key as one or more of the dupes, run the tie_breaker func among them
        g = get_test_group()
        o1, o2, o3 = g.ordered
        tie_breaker = lambda ref, dupe: dupe is o3
        g.prioritize(lambda x: 0, tie_breaker)
        self.assertTrue(g.ref is o3)

    def test_prioritize_with_tie_breaker_runs_on_all_dupes(self):
        # Even if a dupe is chosen to switch with ref with a tie breaker, we still run the tie breaker
        # with other dupes and the newly chosen ref
        g = get_test_group()
        o1, o2, o3 = g.ordered
        o1.foo = 1
        o2.foo = 2
        o3.foo = 3
        tie_breaker = lambda ref, dupe: dupe.foo > ref.foo
        g.prioritize(lambda x: 0, tie_breaker)
        self.assertTrue(g.ref is o3)

    def test_prioritize_with_tie_breaker_runs_only_on_tie_dupes(self):
        # The tie breaker only runs on dupes that had the same value for the key_func
        g = get_test_group()
        o1, o2, o3 = g.ordered
        o1.foo = 2
        o2.foo = 2
        o3.foo = 1
        o1.bar = 1
        o2.bar = 2
        o3.bar = 3
        key_func = lambda x: -x.foo
        tie_breaker = lambda ref, dupe: dupe.bar > ref.bar
        g.prioritize(key_func, tie_breaker)
        self.assertTrue(g.ref is o2)

    def test_list_like(self):
        g = Group()
        o1, o2 = (NamedObject("foo", True), NamedObject("bar", True))
        g.add_match(get_match(o1, o2))
        self.assert_(g[0] is o1)
        self.assert_(g[1] is o2)

    def test_discard_matches(self):
        g = Group()
        o1, o2, o3 = (NamedObject("foo", True), NamedObject("bar", True), NamedObject("baz", True))
        g.add_match(get_match(o1, o2))
        g.add_match(get_match(o1, o3))
        g.discard_matches()
        self.assertEqual(1, len(g.matches))
        self.assertEqual(0, len(g.candidates))


class TCget_groups(TestCase):
    def test_empty(self):
        r = get_groups([])
        self.assertEqual([], r)

    def test_simple(self):
        l = [NamedObject("foo bar"), NamedObject("bar bleh")]
        matches = getmatches(l)
        m = matches[0]
        r = get_groups(matches)
        self.assertEqual(1, len(r))
        g = r[0]
        self.assert_(g.ref is m.first)
        self.assertEqual([m.second], g.dupes)

    def test_group_with_multiple_matches(self):
        # This results in 3 matches
        l = [NamedObject("foo"), NamedObject("foo"), NamedObject("foo")]
        matches = getmatches(l)
        r = get_groups(matches)
        self.assertEqual(1, len(r))
        g = r[0]
        self.assertEqual(3, len(g))

    def test_must_choose_a_group(self):
        l = [NamedObject("a b"), NamedObject("a b"), NamedObject("b c"), NamedObject("c d"), NamedObject("c d")]
        # There will be 2 groups here: group "a b" and group "c d"
        # "b c" can go in either of them, but not in both.
        matches = getmatches(l)
        r = get_groups(matches)
        self.assertEqual(2, len(r))
        self.assertEqual(5, len(r[0]) + len(r[1]))

    def test_should_all_go_in_the_same_group(self):
        l = [NamedObject("a b"), NamedObject("a b"), NamedObject("a b"), NamedObject("a b")]
        # All four objects match each other, so they should all end up in a single group.
        matches = getmatches(l)
        r = get_groups(matches)
        self.assertEqual(1, len(r))

    def test_give_priority_to_matches_with_higher_percentage(self):
        o1 = NamedObject(with_words=True)
        o2 = NamedObject(with_words=True)
        o3 = NamedObject(with_words=True)
        m1 = Match(o1, o2, 1)
        m2 = Match(o2, o3, 2)
        r = get_groups([m1, m2])
        self.assertEqual(1, len(r))
        g = r[0]
        self.assertEqual(2, len(g))
        self.assert_(o1 not in g)
        self.assert_(o2 in g)
        self.assert_(o3 in g)

    def test_four_sized_group(self):
        l = [NamedObject("foobar") for i in range(4)]
        m = getmatches(l)
        r = get_groups(m)
        self.assertEqual(1, len(r))
        self.assertEqual(4, len(r[0]))

    def test_referenced_by_ref2(self):
        o1 = NamedObject(with_words=True)
        o2 = NamedObject(with_words=True)
        o3 = NamedObject(with_words=True)
        m1 = get_match(o1, o2)
        m2 = get_match(o3, o1)
        m3 = get_match(o3, o2)
        r = get_groups([m1, m2, m3])
        self.assertEqual(3, len(r[0]))

    def test_job(self):
        def do_progress(p, d=''):
            self.log.append(p)
            return True

        self.log = []
        j = job.Job(1, do_progress)
        m1, m2, m3 = get_match_triangle()
        # 101%: To make sure it is processed first so the job test works correctly
        m4 = Match(NamedObject('a', True), NamedObject('a', True), 101)
        get_groups([m1, m2, m3, m4], j)
        self.assertEqual(0, self.log[0])
        self.assertEqual(100, self.log[-1])

    def test_group_admissible_discarded_dupes(self):
        # If, with an (A, B, C, D) set, everything matches with A, but C and D don't match with B,
        # and the (A, B) match is the highest (thus resulting in an (A, B) group), we still want to
        # match C and D in a separate group instead of discarding them.
        A, B, C, D = [NamedObject() for _ in range(4)]
        m1 = Match(A, B, 90)  # This is the strongest "A" match
        m2 = Match(A, C, 80)  # Because C doesn't match with B, it won't be in the group
        m3 = Match(A, D, 80)  # Same thing for D
        m4 = Match(C, D, 70)  # However, because C and D match, they should have their own group.
        groups = get_groups([m1, m2, m3, m4])
        eq_(len(groups), 2)
        g1, g2 = groups
        assert A in g1
        assert B in g1
        assert C in g2
        assert D in g2