From 42ebef15dd774b8be4bc1f8d536bd7c05451e7ab Mon Sep 17 00:00:00 2001 From: hsoft Date: Sat, 5 Sep 2009 15:28:10 +0000 Subject: [PATCH] Refactoring: modernized scaner_test and got rid of the obsolete SCAN_TYPE_TAG_WITH_ALBUM scan type const. --HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40117 --- base/py/scanner.py | 8 +- base/py/tests/scanner_test.py | 872 +++++++++++++++++----------------- 2 files changed, 440 insertions(+), 440 deletions(-) diff --git a/base/py/scanner.py b/base/py/scanner.py index 0ac87dc2..320151c2 100644 --- a/base/py/scanner.py +++ b/base/py/scanner.py @@ -9,21 +9,20 @@ import logging -from ignore import IgnoreList from hsutil import job from hsutil.misc import dedupe from hsutil.str import get_file_ext, rem_file_ext from . import engine +from .ignore import IgnoreList (SCAN_TYPE_FILENAME, SCAN_TYPE_FIELDS, SCAN_TYPE_FIELDS_NO_ORDER, SCAN_TYPE_TAG, -SCAN_TYPE_TAG_WITH_ALBUM, # Obsolete SCAN_TYPE_CONTENT, -SCAN_TYPE_CONTENT_AUDIO) = range(7) +SCAN_TYPE_CONTENT_AUDIO) = range(6) SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year'] @@ -42,9 +41,6 @@ class Scanner(object): if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER: self.scan_type = SCAN_TYPE_FIELDS mf.no_field_order = True - if self.scan_type == SCAN_TYPE_TAG_WITH_ALBUM: - self.scan_type = SCAN_TYPE_TAG - self.scanned_tags = set(['artist', 'album', 'title']) func = { SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)), SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)), diff --git a/base/py/tests/scanner_test.py b/base/py/tests/scanner_test.py index 8cd9587a..5356d030 100644 --- a/base/py/tests/scanner_test.py +++ b/base/py/tests/scanner_test.py @@ -7,9 +7,10 @@ # which should be included with this package. The terms are also available at # http://www.hardcoded.net/licenses/hs_license +from nose.tools import eq_ + from hsutil import job from hsutil.path import Path -from hsutil.testcase import TestCase from ..engine import getwords, Match from ..ignore import IgnoreList @@ -25,438 +26,441 @@ class NamedObject(object): no = NamedObject -class TCScanner(TestCase): - def test_empty(self): - s = Scanner() - r = s.GetDupeGroups([]) - self.assertEqual([],r) - - def test_default_settings(self): - s = Scanner() - self.assertEqual(80,s.min_match_percentage) - self.assertEqual(SCAN_TYPE_FILENAME,s.scan_type) - self.assertEqual(True,s.mix_file_kind) - self.assertEqual(False,s.word_weighting) - self.assertEqual(False,s.match_similar_words) - self.assert_(isinstance(s.ignore_list,IgnoreList)) - - def test_simple_with_default_settings(self): - s = Scanner() - f = [no('foo bar'),no('foo bar'),no('foo bleh')] - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - g = r[0] - #'foo bleh' cannot be in the group because the default min match % is 80 - self.assertEqual(2,len(g)) - self.assert_(g.ref in f[:2]) - self.assert_(g.dupes[0] in f[:2]) - - def test_simple_with_lower_min_match(self): - s = Scanner() - s.min_match_percentage = 50 - f = [no('foo bar'),no('foo bar'),no('foo bleh')] - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - g = r[0] - self.assertEqual(3,len(g)) - - def test_trim_all_ref_groups(self): - s = Scanner() - f = [no('foo'),no('foo'),no('bar'),no('bar')] - f[2].is_ref = True - f[3].is_ref = True - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - - def test_priorize(self): - s = Scanner() - f = [no('foo'),no('foo'),no('bar'),no('bar')] - f[1].size = 2 - f[2].size = 3 - f[3].is_ref = True - r = s.GetDupeGroups(f) - g1,g2 = r - self.assert_(f[1] in (g1.ref,g2.ref)) - self.assert_(f[0] in (g1.dupes[0],g2.dupes[0])) - self.assert_(f[3] in (g1.ref,g2.ref)) - self.assert_(f[2] in (g1.dupes[0],g2.dupes[0])) - - def test_content_scan(self): - s = Scanner() - s.scan_type = SCAN_TYPE_CONTENT - f = [no('foo'), no('bar'), no('bleh')] - f[0].md5 = 'foobar' - f[1].md5 = 'foobar' - f[2].md5 = 'bleh' - r = s.GetDupeGroups(f) - self.assertEqual(len(r), 1) - self.assertEqual(len(r[0]), 2) - self.assertEqual(s.discarded_file_count, 0) # don't count the different md5 as discarded! - - def test_content_scan_compare_sizes_first(self): - class MyFile(no): - def get_md5(file): - self.fail() - md5 = property(get_md5) - - s = Scanner() - s.scan_type = SCAN_TYPE_CONTENT - f = [MyFile('foo',1),MyFile('bar',2)] - self.assertEqual(0,len(s.GetDupeGroups(f))) - - def test_min_match_perc_doesnt_matter_for_content_scan(self): - s = Scanner() - s.scan_type = SCAN_TYPE_CONTENT - f = [no('foo'),no('bar'),no('bleh')] - f[0].md5 = 'foobar' - f[1].md5 = 'foobar' - f[2].md5 = 'bleh' - s.min_match_percentage = 101 - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - self.assertEqual(2,len(r[0])) - s.min_match_percentage = 0 - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - self.assertEqual(2,len(r[0])) - - def test_content_scan_puts_md5_in_words_at_the_end(self): - s = Scanner() - s.scan_type = SCAN_TYPE_CONTENT - f = [no('foo'),no('bar')] - f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' - f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' - r = s.GetDupeGroups(f) - g = r[0] - self.assertEqual(['--'],g.ref.words) - self.assertEqual(['--'],g.dupes[0].words) - - def test_extension_is_not_counted_in_filename_scan(self): - s = Scanner() - s.min_match_percentage = 100 - f = [no('foo.bar'),no('foo.bleh')] - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - self.assertEqual(2,len(r[0])) - - def test_job(self): - def do_progress(progress,desc=''): - log.append(progress) - return True - s = Scanner() - log = [] - f = [no('foo bar'),no('foo bar'),no('foo bleh')] - r = s.GetDupeGroups(f, job.Job(1,do_progress)) - self.assertEqual(0,log[0]) - self.assertEqual(100,log[-1]) - - def test_mix_file_kind(self): - s = Scanner() - s.mix_file_kind = False - f = [no('foo.1'),no('foo.2')] - r = s.GetDupeGroups(f) - self.assertEqual(0,len(r)) - - def test_word_weighting(self): - s = Scanner() - s.min_match_percentage = 75 - s.word_weighting = True - f = [no('foo bar'),no('foo bar bleh')] - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - g = r[0] - m = g.get_match_of(g.dupes[0]) - self.assertEqual(75,m.percentage) # 16 letters, 12 matching - - def test_similar_words(self): - s = Scanner() - s.match_similar_words = True - f = [no('The White Stripes'),no('The Whites Stripe'),no('Limp Bizkit'),no('Limp Bizkitt')] - r = s.GetDupeGroups(f) - self.assertEqual(2,len(r)) - - def test_fields(self): - s = Scanner() - s.scan_type = SCAN_TYPE_FIELDS - f = [no('The White Stripes - Little Ghost'),no('The White Stripes - Little Acorn')] - r = s.GetDupeGroups(f) - self.assertEqual(0,len(r)) - - def test_fields_no_order(self): - s = Scanner() - s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER - f = [no('The White Stripes - Little Ghost'),no('Little Ghost - The White Stripes')] - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - - def test_tag_scan(self): - s = Scanner() - s.scan_type = SCAN_TYPE_TAG - o1 = no('foo') - o2 = no('bar') - o1.artist = 'The White Stripes' - o1.title = 'The Air Near My Fingers' - o2.artist = 'The White Stripes' - o2.title = 'The Air Near My Fingers' - r = s.GetDupeGroups([o1,o2]) - self.assertEqual(1,len(r)) - - def test_tag_with_album_scan(self): - s = Scanner() - s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM - o1 = no('foo') - o2 = no('bar') - o3 = no('bleh') - o1.artist = 'The White Stripes' - o1.title = 'The Air Near My Fingers' - o1.album = 'Elephant' - o2.artist = 'The White Stripes' - o2.title = 'The Air Near My Fingers' - o2.album = 'Elephant' - o3.artist = 'The White Stripes' - o3.title = 'The Air Near My Fingers' - o3.album = 'foobar' - r = s.GetDupeGroups([o1,o2,o3]) - self.assertEqual(1,len(r)) - - def test_that_dash_in_tags_dont_create_new_fields(self): - s = Scanner() - s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM - s.min_match_percentage = 50 - o1 = no('foo') - o2 = no('bar') - o1.artist = 'The White Stripes - a' - o1.title = 'The Air Near My Fingers - a' - o1.album = 'Elephant - a' - o2.artist = 'The White Stripes - b' - o2.title = 'The Air Near My Fingers - b' - o2.album = 'Elephant - b' - r = s.GetDupeGroups([o1,o2]) - self.assertEqual(1,len(r)) - - def test_tag_scan_with_different_scanned(self): - s = Scanner() - s.scan_type = SCAN_TYPE_TAG - s.scanned_tags = set(['track', 'year']) - o1 = no('foo') - o2 = no('bar') - o1.artist = 'The White Stripes' - o1.title = 'some title' - o1.track = 'foo' - o1.year = 'bar' - o2.artist = 'The White Stripes' - o2.title = 'another title' - o2.track = 'foo' - o2.year = 'bar' - r = s.GetDupeGroups([o1, o2]) - self.assertEqual(1, len(r)) - - def test_tag_scan_only_scans_existing_tags(self): - s = Scanner() - s.scan_type = SCAN_TYPE_TAG - s.scanned_tags = set(['artist', 'foo']) - o1 = no('foo') - o2 = no('bar') - o1.artist = 'The White Stripes' - o1.foo = 'foo' - o2.artist = 'The White Stripes' - o2.foo = 'bar' - r = s.GetDupeGroups([o1, o2]) - self.assertEqual(1, len(r)) # Because 'foo' is not scanned, they match - - def test_tag_scan_converts_to_str(self): - s = Scanner() - s.scan_type = SCAN_TYPE_TAG - s.scanned_tags = set(['track']) - o1 = no('foo') - o2 = no('bar') - o1.track = 42 - o2.track = 42 - try: - r = s.GetDupeGroups([o1, o2]) - except TypeError: - self.fail() - self.assertEqual(1, len(r)) - - def test_tag_scan_non_ascii(self): - s = Scanner() - s.scan_type = SCAN_TYPE_TAG - s.scanned_tags = set(['title']) - o1 = no('foo') - o2 = no('bar') - o1.title = u'foobar\u00e9' - o2.title = u'foobar\u00e9' - try: - r = s.GetDupeGroups([o1, o2]) - except UnicodeEncodeError: - self.fail() - self.assertEqual(1, len(r)) - - def test_audio_content_scan(self): - s = Scanner() - s.scan_type = SCAN_TYPE_CONTENT_AUDIO - f = [no('foo'),no('bar'),no('bleh')] - f[0].md5 = 'foo' - f[1].md5 = 'bar' - f[2].md5 = 'bleh' - f[0].md5partial = 'foo' - f[1].md5partial = 'foo' - f[2].md5partial = 'bleh' - f[0].audiosize = 1 - f[1].audiosize = 1 - f[2].audiosize = 1 - r = s.GetDupeGroups(f) - self.assertEqual(1,len(r)) - self.assertEqual(2,len(r[0])) - - def test_audio_content_scan_compare_sizes_first(self): - class MyFile(no): - def get_md5(file): - self.fail() - md5partial = property(get_md5) - - s = Scanner() - s.scan_type = SCAN_TYPE_CONTENT_AUDIO - f = [MyFile('foo'),MyFile('bar')] - f[0].audiosize = 1 - f[1].audiosize = 2 - self.assertEqual(0,len(s.GetDupeGroups(f))) - - def test_ignore_list(self): - s = Scanner() - f1 = no('foobar') - f2 = no('foobar') - f3 = no('foobar') - f1.path = Path('dir1/foobar') - f2.path = Path('dir2/foobar') - f3.path = Path('dir3/foobar') - s.ignore_list.Ignore(str(f1.path),str(f2.path)) - s.ignore_list.Ignore(str(f1.path),str(f3.path)) - r = s.GetDupeGroups([f1,f2,f3]) - self.assertEqual(1,len(r)) - g = r[0] - self.assertEqual(1,len(g.dupes)) - self.assert_(f1 not in g) - self.assert_(f2 in g) - self.assert_(f3 in g) - # Ignored matches are not counted as discarded - self.assertEqual(s.discarded_file_count, 0) - - def test_ignore_list_checks_for_unicode(self): - #scanner was calling path_str for ignore list checks. Since the Path changes, it must - #be unicode(path) - s = Scanner() - f1 = no('foobar') - f2 = no('foobar') - f3 = no('foobar') - f1.path = Path(u'foo1\u00e9') - f2.path = Path(u'foo2\u00e9') - f3.path = Path(u'foo3\u00e9') - s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path)) - s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path)) - r = s.GetDupeGroups([f1,f2,f3]) - self.assertEqual(1,len(r)) - g = r[0] - self.assertEqual(1,len(g.dupes)) - self.assert_(f1 not in g) - self.assert_(f2 in g) - self.assert_(f3 in g) - - def test_custom_match_factory(self): - class MatchFactory(object): - def getmatches(self,objects,j=None): - return [Match(objects[0], objects[1], 420)] - - - s = Scanner() - s.match_factory = MatchFactory() - o1,o2 = no('foo'),no('bar') - groups = s.GetDupeGroups([o1,o2]) - self.assertEqual(1,len(groups)) - g = groups[0] - self.assertEqual(2,len(g)) - g.switch_ref(o1) - m = g.get_match_of(o2) - self.assertEqual((o1,o2,420),m) - - def test_file_evaluates_to_false(self): - # A very wrong way to use any() was added at some point, causing resulting group list - # to be empty. - class FalseNamedObject(NamedObject): - def __nonzero__(self): - return False - - - s = Scanner() - f1 = FalseNamedObject('foobar') - f2 = FalseNamedObject('foobar') - r = s.GetDupeGroups([f1,f2]) - self.assertEqual(1,len(r)) - - def test_size_threshold(self): - # Only file equal or higher than the size_threshold in size are scanned - s = Scanner() - f1 = no('foo', 1) - f2 = no('foo', 2) - f3 = no('foo', 3) - s.size_threshold = 2 - groups = s.GetDupeGroups([f1,f2,f3]) - self.assertEqual(len(groups), 1) - [group] = groups - self.assertEqual(len(group), 2) - self.assertTrue(f1 not in group) - self.assertTrue(f2 in group) - self.assertTrue(f3 in group) - - def test_tie_breaker_path_deepness(self): - # If there is a tie in prioritization, path deepness is used as a tie breaker - s = Scanner() - o1, o2 = no('foo'), no('foo') - o1.path = Path('foo') - o2.path = Path('foo/bar') - [group] = s.GetDupeGroups([o1, o2]) - self.assertTrue(group.ref is o2) - - def test_tie_breaker_copy(self): - # if copy is in the words used (even if it has a deeper path), it becomes a dupe - s = Scanner() - o1, o2 = no('foo bar Copy'), no('foo bar') - o1.path = Path('deeper/path') - o2.path = Path('foo') - [group] = s.GetDupeGroups([o1, o2]) - self.assertTrue(group.ref is o2) - - def test_tie_breaker_same_name_plus_digit(self): - # if ref has the same words as dupe, but has some just one extra word which is a digit, it - # becomes a dupe - s = Scanner() - o1, o2 = no('foo bar 42'), no('foo bar') - o1.path = Path('deeper/path') - o2.path = Path('foo') - [group] = s.GetDupeGroups([o1, o2]) - self.assertTrue(group.ref is o2) - - def test_partial_group_match(self): - # Count the number od discarded matches (when a file doesn't match all other dupes of the - # group) in Scanner.discarded_file_count - s = Scanner() - o1, o2, o3 = no('a b'), no('a'), no('b') - s.min_match_percentage = 50 - [group] = s.GetDupeGroups([o1, o2, o3]) - self.assertEqual(len(group), 2) - self.assertTrue(o1 in group) - self.assertTrue(o2 in group) - self.assertTrue(o3 not in group) - self.assertEqual(s.discarded_file_count, 1) - +#--- Scanner +def test_empty(): + s = Scanner() + r = s.GetDupeGroups([]) + eq_(r, []) -class TCScannerME(TestCase): - def test_priorize(self): - # in ScannerME, bitrate goes first (right after is_ref) in priorization - s = ScannerME() - o1, o2 = no('foo'), no('foo') - o1.bitrate = 1 - o2.bitrate = 2 - [group] = s.GetDupeGroups([o1, o2]) - self.assertTrue(group.ref is o2) +def test_default_settings(): + s = Scanner() + eq_(s.min_match_percentage, 80) + eq_(s.scan_type, SCAN_TYPE_FILENAME) + eq_(s.mix_file_kind, True) + eq_(s.word_weighting, False) + eq_(s.match_similar_words, False) + assert isinstance(s.ignore_list, IgnoreList) + +def test_simple_with_default_settings(): + s = Scanner() + f = [no('foo bar'), no('foo bar'), no('foo bleh')] + r = s.GetDupeGroups(f) + eq_(len(r), 1) + g = r[0] + #'foo bleh' cannot be in the group because the default min match % is 80 + eq_(len(g), 2) + assert g.ref in f[:2] + assert g.dupes[0] in f[:2] + +def test_simple_with_lower_min_match(): + s = Scanner() + s.min_match_percentage = 50 + f = [no('foo bar'), no('foo bar'), no('foo bleh')] + r = s.GetDupeGroups(f) + eq_(len(r), 1) + g = r[0] + eq_(len(g), 3) + +def test_trim_all_ref_groups(): + s = Scanner() + f = [no('foo'), no('foo'), no('bar'), no('bar')] + f[2].is_ref = True + f[3].is_ref = True + r = s.GetDupeGroups(f) + eq_(len(r), 1) + +def test_priorize(): + s = Scanner() + f = [no('foo'), no('foo'), no('bar'), no('bar')] + f[1].size = 2 + f[2].size = 3 + f[3].is_ref = True + r = s.GetDupeGroups(f) + g1, g2 = r + assert f[1] in (g1.ref,g2.ref) + assert f[0] in (g1.dupes[0],g2.dupes[0]) + assert f[3] in (g1.ref,g2.ref) + assert f[2] in (g1.dupes[0],g2.dupes[0]) + +def test_content_scan(): + s = Scanner() + s.scan_type = SCAN_TYPE_CONTENT + f = [no('foo'), no('bar'), no('bleh')] + f[0].md5 = 'foobar' + f[1].md5 = 'foobar' + f[2].md5 = 'bleh' + r = s.GetDupeGroups(f) + eq_(len(r), 1) + eq_(len(r[0]), 2) + eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded! + +def test_content_scan_compare_sizes_first(): + class MyFile(no): + @property + def md5(file): + raise AssertionError() + s = Scanner() + s.scan_type = SCAN_TYPE_CONTENT + f = [MyFile('foo', 1), MyFile('bar', 2)] + eq_(len(s.GetDupeGroups(f)), 0) + +def test_min_match_perc_doesnt_matter_for_content_scan(): + s = Scanner() + s.scan_type = SCAN_TYPE_CONTENT + f = [no('foo'), no('bar'), no('bleh')] + f[0].md5 = 'foobar' + f[1].md5 = 'foobar' + f[2].md5 = 'bleh' + s.min_match_percentage = 101 + r = s.GetDupeGroups(f) + eq_(len(r), 1) + eq_(len(r[0]), 2) + s.min_match_percentage = 0 + r = s.GetDupeGroups(f) + eq_(len(r), 1) + eq_(len(r[0]), 2) + +def test_content_scan_puts_md5_in_words_at_the_end(): + s = Scanner() + s.scan_type = SCAN_TYPE_CONTENT + f = [no('foo'),no('bar')] + f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' + f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' + r = s.GetDupeGroups(f) + g = r[0] + eq_(g.ref.words, ['--']) + eq_(g.dupes[0].words, ['--']) + +def test_extension_is_not_counted_in_filename_scan(): + s = Scanner() + s.min_match_percentage = 100 + f = [no('foo.bar'), no('foo.bleh')] + r = s.GetDupeGroups(f) + eq_(len(r), 1) + eq_(len(r[0]), 2) + +def test_job(): + def do_progress(progress, desc=''): + log.append(progress) + return True + + s = Scanner() + log = [] + f = [no('foo bar'), no('foo bar'), no('foo bleh')] + r = s.GetDupeGroups(f, job.Job(1, do_progress)) + eq_(log[0], 0) + eq_(log[-1], 100) + +def test_mix_file_kind(): + s = Scanner() + s.mix_file_kind = False + f = [no('foo.1'), no('foo.2')] + r = s.GetDupeGroups(f) + eq_(len(r), 0) + +def test_word_weighting(): + s = Scanner() + s.min_match_percentage = 75 + s.word_weighting = True + f = [no('foo bar'), no('foo bar bleh')] + r = s.GetDupeGroups(f) + eq_(len(r), 1) + g = r[0] + m = g.get_match_of(g.dupes[0]) + eq_(m.percentage, 75) # 16 letters, 12 matching + +def test_similar_words(): + s = Scanner() + s.match_similar_words = True + f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')] + r = s.GetDupeGroups(f) + eq_(len(r), 2) + +def test_fields(): + s = Scanner() + s.scan_type = SCAN_TYPE_FIELDS + f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')] + r = s.GetDupeGroups(f) + eq_(len(r), 0) + +def test_fields_no_order(): + s = Scanner() + s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER + f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')] + r = s.GetDupeGroups(f) + eq_(len(r), 1) + +def test_tag_scan(): + s = Scanner() + s.scan_type = SCAN_TYPE_TAG + o1 = no('foo') + o2 = no('bar') + o1.artist = 'The White Stripes' + o1.title = 'The Air Near My Fingers' + o2.artist = 'The White Stripes' + o2.title = 'The Air Near My Fingers' + r = s.GetDupeGroups([o1,o2]) + eq_(len(r), 1) + +def test_tag_with_album_scan(): + s = Scanner() + s.scan_type = SCAN_TYPE_TAG + s.scanned_tags = set(['artist', 'album', 'title']) + o1 = no('foo') + o2 = no('bar') + o3 = no('bleh') + o1.artist = 'The White Stripes' + o1.title = 'The Air Near My Fingers' + o1.album = 'Elephant' + o2.artist = 'The White Stripes' + o2.title = 'The Air Near My Fingers' + o2.album = 'Elephant' + o3.artist = 'The White Stripes' + o3.title = 'The Air Near My Fingers' + o3.album = 'foobar' + r = s.GetDupeGroups([o1,o2,o3]) + eq_(len(r), 1) + +def test_that_dash_in_tags_dont_create_new_fields(): + s = Scanner() + s.scan_type = SCAN_TYPE_TAG + s.scanned_tags = set(['artist', 'album', 'title']) + s.min_match_percentage = 50 + o1 = no('foo') + o2 = no('bar') + o1.artist = 'The White Stripes - a' + o1.title = 'The Air Near My Fingers - a' + o1.album = 'Elephant - a' + o2.artist = 'The White Stripes - b' + o2.title = 'The Air Near My Fingers - b' + o2.album = 'Elephant - b' + r = s.GetDupeGroups([o1,o2]) + eq_(len(r), 1) + +def test_tag_scan_with_different_scanned(): + s = Scanner() + s.scan_type = SCAN_TYPE_TAG + s.scanned_tags = set(['track', 'year']) + o1 = no('foo') + o2 = no('bar') + o1.artist = 'The White Stripes' + o1.title = 'some title' + o1.track = 'foo' + o1.year = 'bar' + o2.artist = 'The White Stripes' + o2.title = 'another title' + o2.track = 'foo' + o2.year = 'bar' + r = s.GetDupeGroups([o1, o2]) + eq_(len(r), 1) + +def test_tag_scan_only_scans_existing_tags(): + s = Scanner() + s.scan_type = SCAN_TYPE_TAG + s.scanned_tags = set(['artist', 'foo']) + o1 = no('foo') + o2 = no('bar') + o1.artist = 'The White Stripes' + o1.foo = 'foo' + o2.artist = 'The White Stripes' + o2.foo = 'bar' + r = s.GetDupeGroups([o1, o2]) + eq_(len(r), 1) # Because 'foo' is not scanned, they match + +def test_tag_scan_converts_to_str(): + s = Scanner() + s.scan_type = SCAN_TYPE_TAG + s.scanned_tags = set(['track']) + o1 = no('foo') + o2 = no('bar') + o1.track = 42 + o2.track = 42 + try: + r = s.GetDupeGroups([o1, o2]) + except TypeError: + raise AssertionError() + eq_(len(r), 1) + +def test_tag_scan_non_ascii(): + s = Scanner() + s.scan_type = SCAN_TYPE_TAG + s.scanned_tags = set(['title']) + o1 = no('foo') + o2 = no('bar') + o1.title = u'foobar\u00e9' + o2.title = u'foobar\u00e9' + try: + r = s.GetDupeGroups([o1, o2]) + except UnicodeEncodeError: + raise AssertionError() + eq_(len(r), 1) + +def test_audio_content_scan(): + s = Scanner() + s.scan_type = SCAN_TYPE_CONTENT_AUDIO + f = [no('foo'), no('bar'), no('bleh')] + f[0].md5 = 'foo' + f[1].md5 = 'bar' + f[2].md5 = 'bleh' + f[0].md5partial = 'foo' + f[1].md5partial = 'foo' + f[2].md5partial = 'bleh' + f[0].audiosize = 1 + f[1].audiosize = 1 + f[2].audiosize = 1 + r = s.GetDupeGroups(f) + eq_(len(r), 1) + eq_(len(r[0]), 2) + +def test_audio_content_scan_compare_sizes_first(): + class MyFile(no): + @property + def md5partial(file): + raise AssertionError() + + s = Scanner() + s.scan_type = SCAN_TYPE_CONTENT_AUDIO + f = [MyFile('foo'), MyFile('bar')] + f[0].audiosize = 1 + f[1].audiosize = 2 + eq_(len(s.GetDupeGroups(f)), 0) + +def test_ignore_list(): + s = Scanner() + f1 = no('foobar') + f2 = no('foobar') + f3 = no('foobar') + f1.path = Path('dir1/foobar') + f2.path = Path('dir2/foobar') + f3.path = Path('dir3/foobar') + s.ignore_list.Ignore(str(f1.path),str(f2.path)) + s.ignore_list.Ignore(str(f1.path),str(f3.path)) + r = s.GetDupeGroups([f1,f2,f3]) + eq_(len(r), 1) + g = r[0] + eq_(len(g.dupes), 1) + assert f1 not in g + assert f2 in g + assert f3 in g + # Ignored matches are not counted as discarded + eq_(s.discarded_file_count, 0) + +def test_ignore_list_checks_for_unicode(): + #scanner was calling path_str for ignore list checks. Since the Path changes, it must + #be unicode(path) + s = Scanner() + f1 = no('foobar') + f2 = no('foobar') + f3 = no('foobar') + f1.path = Path(u'foo1\u00e9') + f2.path = Path(u'foo2\u00e9') + f3.path = Path(u'foo3\u00e9') + s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path)) + s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path)) + r = s.GetDupeGroups([f1,f2,f3]) + eq_(len(r), 1) + g = r[0] + eq_(len(g.dupes), 1) + assert f1 not in g + assert f2 in g + assert f3 in g + +def test_custom_match_factory(): + class MatchFactory(object): + def getmatches(self, objects, j=None): + return [Match(objects[0], objects[1], 420)] + + + s = Scanner() + s.match_factory = MatchFactory() + o1, o2 = no('foo'), no('bar') + groups = s.GetDupeGroups([o1, o2]) + eq_(len(groups), 1) + g = groups[0] + eq_(len(g), 2) + g.switch_ref(o1) + m = g.get_match_of(o2) + eq_(m, (o1, o2, 420)) + +def test_file_evaluates_to_false(): + # A very wrong way to use any() was added at some point, causing resulting group list + # to be empty. + class FalseNamedObject(NamedObject): + def __nonzero__(self): + return False + + + s = Scanner() + f1 = FalseNamedObject('foobar') + f2 = FalseNamedObject('foobar') + r = s.GetDupeGroups([f1, f2]) + eq_(len(r), 1) + +def test_size_threshold(): + # Only file equal or higher than the size_threshold in size are scanned + s = Scanner() + f1 = no('foo', 1) + f2 = no('foo', 2) + f3 = no('foo', 3) + s.size_threshold = 2 + groups = s.GetDupeGroups([f1,f2,f3]) + eq_(len(groups), 1) + [group] = groups + eq_(len(group), 2) + assert f1 not in group + assert f2 in group + assert f3 in group + +def test_tie_breaker_path_deepness(): + # If there is a tie in prioritization, path deepness is used as a tie breaker + s = Scanner() + o1, o2 = no('foo'), no('foo') + o1.path = Path('foo') + o2.path = Path('foo/bar') + [group] = s.GetDupeGroups([o1, o2]) + assert group.ref is o2 + +def test_tie_breaker_copy(): + # if copy is in the words used (even if it has a deeper path), it becomes a dupe + s = Scanner() + o1, o2 = no('foo bar Copy'), no('foo bar') + o1.path = Path('deeper/path') + o2.path = Path('foo') + [group] = s.GetDupeGroups([o1, o2]) + assert group.ref is o2 + +def test_tie_breaker_same_name_plus_digit(): + # if ref has the same words as dupe, but has some just one extra word which is a digit, it + # becomes a dupe + s = Scanner() + o1, o2 = no('foo bar 42'), no('foo bar') + o1.path = Path('deeper/path') + o2.path = Path('foo') + [group] = s.GetDupeGroups([o1, o2]) + assert group.ref is o2 + +def test_partial_group_match(): + # Count the number od discarded matches (when a file doesn't match all other dupes of the + # group) in Scanner.discarded_file_count + s = Scanner() + o1, o2, o3 = no('a b'), no('a'), no('b') + s.min_match_percentage = 50 + [group] = s.GetDupeGroups([o1, o2, o3]) + eq_(len(group), 2) + assert o1 in group + assert o2 in group + assert o3 not in group + eq_(s.discarded_file_count, 1) + + +#--- Scanner ME +def test_priorize_me(): + # in ScannerME, bitrate goes first (right after is_ref) in priorization + s = ScannerME() + o1, o2 = no('foo'), no('foo') + o1.bitrate = 1 + o2.bitrate = 2 + [group] = s.GetDupeGroups([o1, o2]) + assert group.ref is o2 +