1
0
mirror of https://github.com/arsenetar/dupeguru.git synced 2025-05-08 09:49:51 +00:00

Refactoring: modernized scanner_test and got rid of the obsolete SCAN_TYPE_TAG_WITH_ALBUM scan type const.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40117
This commit is contained in:
hsoft 2009-09-05 15:28:10 +00:00
parent 6d5ae99509
commit 42ebef15dd
2 changed files with 440 additions and 440 deletions

View File

@ -9,21 +9,20 @@
import logging import logging
from ignore import IgnoreList
from hsutil import job from hsutil import job
from hsutil.misc import dedupe from hsutil.misc import dedupe
from hsutil.str import get_file_ext, rem_file_ext from hsutil.str import get_file_ext, rem_file_ext
from . import engine from . import engine
from .ignore import IgnoreList
(SCAN_TYPE_FILENAME, (SCAN_TYPE_FILENAME,
SCAN_TYPE_FIELDS, SCAN_TYPE_FIELDS,
SCAN_TYPE_FIELDS_NO_ORDER, SCAN_TYPE_FIELDS_NO_ORDER,
SCAN_TYPE_TAG, SCAN_TYPE_TAG,
SCAN_TYPE_TAG_WITH_ALBUM, # Obsolete
SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT,
SCAN_TYPE_CONTENT_AUDIO) = range(7) SCAN_TYPE_CONTENT_AUDIO) = range(6)
SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year'] SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year']
@ -42,9 +41,6 @@ class Scanner(object):
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER: if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
self.scan_type = SCAN_TYPE_FIELDS self.scan_type = SCAN_TYPE_FIELDS
mf.no_field_order = True mf.no_field_order = True
if self.scan_type == SCAN_TYPE_TAG_WITH_ALBUM:
self.scan_type = SCAN_TYPE_TAG
self.scanned_tags = set(['artist', 'album', 'title'])
func = { func = {
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)), SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)), SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),

View File

@ -7,9 +7,10 @@
# which should be included with this package. The terms are also available at # which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license # http://www.hardcoded.net/licenses/hs_license
from nose.tools import eq_
from hsutil import job from hsutil import job
from hsutil.path import Path from hsutil.path import Path
from hsutil.testcase import TestCase
from ..engine import getwords, Match from ..engine import getwords, Match
from ..ignore import IgnoreList from ..ignore import IgnoreList
@ -25,438 +26,441 @@ class NamedObject(object):
no = NamedObject no = NamedObject
class TCScanner(TestCase): #--- Scanner
def test_empty(self): def test_empty():
s = Scanner() s = Scanner()
r = s.GetDupeGroups([]) r = s.GetDupeGroups([])
self.assertEqual([],r) eq_(r, [])
def test_default_settings(self): def test_default_settings():
s = Scanner() s = Scanner()
self.assertEqual(80,s.min_match_percentage) eq_(s.min_match_percentage, 80)
self.assertEqual(SCAN_TYPE_FILENAME,s.scan_type) eq_(s.scan_type, SCAN_TYPE_FILENAME)
self.assertEqual(True,s.mix_file_kind) eq_(s.mix_file_kind, True)
self.assertEqual(False,s.word_weighting) eq_(s.word_weighting, False)
self.assertEqual(False,s.match_similar_words) eq_(s.match_similar_words, False)
self.assert_(isinstance(s.ignore_list,IgnoreList)) assert isinstance(s.ignore_list, IgnoreList)
def test_simple_with_default_settings(self): def test_simple_with_default_settings():
s = Scanner() s = Scanner()
f = [no('foo bar'),no('foo bar'),no('foo bleh')] f = [no('foo bar'), no('foo bar'), no('foo bleh')]
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(1,len(r)) eq_(len(r), 1)
g = r[0] g = r[0]
#'foo bleh' cannot be in the group because the default min match % is 80 #'foo bleh' cannot be in the group because the default min match % is 80
self.assertEqual(2,len(g)) eq_(len(g), 2)
self.assert_(g.ref in f[:2]) assert g.ref in f[:2]
self.assert_(g.dupes[0] in f[:2]) assert g.dupes[0] in f[:2]
def test_simple_with_lower_min_match(self): def test_simple_with_lower_min_match():
s = Scanner() s = Scanner()
s.min_match_percentage = 50 s.min_match_percentage = 50
f = [no('foo bar'),no('foo bar'),no('foo bleh')] f = [no('foo bar'), no('foo bar'), no('foo bleh')]
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(1,len(r)) eq_(len(r), 1)
g = r[0] g = r[0]
self.assertEqual(3,len(g)) eq_(len(g), 3)
def test_trim_all_ref_groups(self): def test_trim_all_ref_groups():
s = Scanner() s = Scanner()
f = [no('foo'),no('foo'),no('bar'),no('bar')] f = [no('foo'), no('foo'), no('bar'), no('bar')]
f[2].is_ref = True f[2].is_ref = True
f[3].is_ref = True f[3].is_ref = True
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(1,len(r)) eq_(len(r), 1)
def test_priorize(self): def test_priorize():
s = Scanner() s = Scanner()
f = [no('foo'),no('foo'),no('bar'),no('bar')] f = [no('foo'), no('foo'), no('bar'), no('bar')]
f[1].size = 2 f[1].size = 2
f[2].size = 3 f[2].size = 3
f[3].is_ref = True f[3].is_ref = True
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
g1,g2 = r g1, g2 = r
self.assert_(f[1] in (g1.ref,g2.ref)) assert f[1] in (g1.ref,g2.ref)
self.assert_(f[0] in (g1.dupes[0],g2.dupes[0])) assert f[0] in (g1.dupes[0],g2.dupes[0])
self.assert_(f[3] in (g1.ref,g2.ref)) assert f[3] in (g1.ref,g2.ref)
self.assert_(f[2] in (g1.dupes[0],g2.dupes[0])) assert f[2] in (g1.dupes[0],g2.dupes[0])
def test_content_scan(self): def test_content_scan():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')] f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = 'foobar' f[0].md5 = 'foobar'
f[1].md5 = 'foobar' f[1].md5 = 'foobar'
f[2].md5 = 'bleh' f[2].md5 = 'bleh'
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(len(r), 1) eq_(len(r), 1)
self.assertEqual(len(r[0]), 2) eq_(len(r[0]), 2)
self.assertEqual(s.discarded_file_count, 0) # don't count the different md5 as discarded! eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!
def test_content_scan_compare_sizes_first(self): def test_content_scan_compare_sizes_first():
class MyFile(no): class MyFile(no):
def get_md5(file): @property
self.fail() def md5(file):
md5 = property(get_md5) raise AssertionError()
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT s.scan_type = SCAN_TYPE_CONTENT
f = [MyFile('foo',1),MyFile('bar',2)] f = [MyFile('foo', 1), MyFile('bar', 2)]
self.assertEqual(0,len(s.GetDupeGroups(f))) eq_(len(s.GetDupeGroups(f)), 0)
def test_min_match_perc_doesnt_matter_for_content_scan(self): def test_min_match_perc_doesnt_matter_for_content_scan():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'),no('bar'),no('bleh')] f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = 'foobar' f[0].md5 = 'foobar'
f[1].md5 = 'foobar' f[1].md5 = 'foobar'
f[2].md5 = 'bleh' f[2].md5 = 'bleh'
s.min_match_percentage = 101 s.min_match_percentage = 101
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(1,len(r)) eq_(len(r), 1)
self.assertEqual(2,len(r[0])) eq_(len(r[0]), 2)
s.min_match_percentage = 0 s.min_match_percentage = 0
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(1,len(r)) eq_(len(r), 1)
self.assertEqual(2,len(r[0])) eq_(len(r[0]), 2)
def test_content_scan_puts_md5_in_words_at_the_end(self): def test_content_scan_puts_md5_in_words_at_the_end():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'),no('bar')] f = [no('foo'),no('bar')]
f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
g = r[0] g = r[0]
self.assertEqual(['--'],g.ref.words) eq_(g.ref.words, ['--'])
self.assertEqual(['--'],g.dupes[0].words) eq_(g.dupes[0].words, ['--'])
def test_extension_is_not_counted_in_filename_scan(self): def test_extension_is_not_counted_in_filename_scan():
s = Scanner() s = Scanner()
s.min_match_percentage = 100 s.min_match_percentage = 100
f = [no('foo.bar'),no('foo.bleh')] f = [no('foo.bar'), no('foo.bleh')]
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(1,len(r)) eq_(len(r), 1)
self.assertEqual(2,len(r[0])) eq_(len(r[0]), 2)
def test_job(self): def test_job():
def do_progress(progress,desc=''): def do_progress(progress, desc=''):
log.append(progress) log.append(progress)
return True return True
s = Scanner()
log = []
f = [no('foo bar'),no('foo bar'),no('foo bleh')]
r = s.GetDupeGroups(f, job.Job(1,do_progress))
self.assertEqual(0,log[0])
self.assertEqual(100,log[-1])
def test_mix_file_kind(self): s = Scanner()
s = Scanner() log = []
s.mix_file_kind = False f = [no('foo bar'), no('foo bar'), no('foo bleh')]
f = [no('foo.1'),no('foo.2')] r = s.GetDupeGroups(f, job.Job(1, do_progress))
r = s.GetDupeGroups(f) eq_(log[0], 0)
self.assertEqual(0,len(r)) eq_(log[-1], 100)
def test_word_weighting(self): def test_mix_file_kind():
s = Scanner() s = Scanner()
s.min_match_percentage = 75 s.mix_file_kind = False
s.word_weighting = True f = [no('foo.1'), no('foo.2')]
f = [no('foo bar'),no('foo bar bleh')] r = s.GetDupeGroups(f)
r = s.GetDupeGroups(f) eq_(len(r), 0)
self.assertEqual(1,len(r))
g = r[0]
m = g.get_match_of(g.dupes[0])
self.assertEqual(75,m.percentage) # 16 letters, 12 matching
def test_similar_words(self): def test_word_weighting():
s = Scanner() s = Scanner()
s.match_similar_words = True s.min_match_percentage = 75
f = [no('The White Stripes'),no('The Whites Stripe'),no('Limp Bizkit'),no('Limp Bizkitt')] s.word_weighting = True
r = s.GetDupeGroups(f) f = [no('foo bar'), no('foo bar bleh')]
self.assertEqual(2,len(r)) r = s.GetDupeGroups(f)
eq_(len(r), 1)
g = r[0]
m = g.get_match_of(g.dupes[0])
eq_(m.percentage, 75) # 16 letters, 12 matching
def test_fields(self): def test_similar_words():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS s.match_similar_words = True
f = [no('The White Stripes - Little Ghost'),no('The White Stripes - Little Acorn')] f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(0,len(r)) eq_(len(r), 2)
def test_fields_no_order(self): def test_fields():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER s.scan_type = SCAN_TYPE_FIELDS
f = [no('The White Stripes - Little Ghost'),no('Little Ghost - The White Stripes')] f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
r = s.GetDupeGroups(f) r = s.GetDupeGroups(f)
self.assertEqual(1,len(r)) eq_(len(r), 0)
def test_tag_scan(self): def test_fields_no_order():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_TAG s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
o1 = no('foo') f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
o2 = no('bar') r = s.GetDupeGroups(f)
o1.artist = 'The White Stripes' eq_(len(r), 1)
o1.title = 'The Air Near My Fingers'
o2.artist = 'The White Stripes'
o2.title = 'The Air Near My Fingers'
r = s.GetDupeGroups([o1,o2])
self.assertEqual(1,len(r))
def test_tag_with_album_scan(self): def test_tag_scan():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM s.scan_type = SCAN_TYPE_TAG
o1 = no('foo') o1 = no('foo')
o2 = no('bar') o2 = no('bar')
o3 = no('bleh') o1.artist = 'The White Stripes'
o1.artist = 'The White Stripes' o1.title = 'The Air Near My Fingers'
o1.title = 'The Air Near My Fingers' o2.artist = 'The White Stripes'
o1.album = 'Elephant' o2.title = 'The Air Near My Fingers'
o2.artist = 'The White Stripes' r = s.GetDupeGroups([o1,o2])
o2.title = 'The Air Near My Fingers' eq_(len(r), 1)
o2.album = 'Elephant'
o3.artist = 'The White Stripes'
o3.title = 'The Air Near My Fingers'
o3.album = 'foobar'
r = s.GetDupeGroups([o1,o2,o3])
self.assertEqual(1,len(r))
def test_that_dash_in_tags_dont_create_new_fields(self): def test_tag_with_album_scan():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_TAG_WITH_ALBUM s.scan_type = SCAN_TYPE_TAG
s.min_match_percentage = 50 s.scanned_tags = set(['artist', 'album', 'title'])
o1 = no('foo') o1 = no('foo')
o2 = no('bar') o2 = no('bar')
o1.artist = 'The White Stripes - a' o3 = no('bleh')
o1.title = 'The Air Near My Fingers - a' o1.artist = 'The White Stripes'
o1.album = 'Elephant - a' o1.title = 'The Air Near My Fingers'
o2.artist = 'The White Stripes - b' o1.album = 'Elephant'
o2.title = 'The Air Near My Fingers - b' o2.artist = 'The White Stripes'
o2.album = 'Elephant - b' o2.title = 'The Air Near My Fingers'
r = s.GetDupeGroups([o1,o2]) o2.album = 'Elephant'
self.assertEqual(1,len(r)) o3.artist = 'The White Stripes'
o3.title = 'The Air Near My Fingers'
o3.album = 'foobar'
r = s.GetDupeGroups([o1,o2,o3])
eq_(len(r), 1)
def test_tag_scan_with_different_scanned(self): def test_that_dash_in_tags_dont_create_new_fields():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_TAG s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track', 'year']) s.scanned_tags = set(['artist', 'album', 'title'])
o1 = no('foo') s.min_match_percentage = 50
o2 = no('bar') o1 = no('foo')
o1.artist = 'The White Stripes' o2 = no('bar')
o1.title = 'some title' o1.artist = 'The White Stripes - a'
o1.track = 'foo' o1.title = 'The Air Near My Fingers - a'
o1.year = 'bar' o1.album = 'Elephant - a'
o2.artist = 'The White Stripes' o2.artist = 'The White Stripes - b'
o2.title = 'another title' o2.title = 'The Air Near My Fingers - b'
o2.track = 'foo' o2.album = 'Elephant - b'
o2.year = 'bar' r = s.GetDupeGroups([o1,o2])
eq_(len(r), 1)
def test_tag_scan_with_different_scanned():
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track', 'year'])
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes'
o1.title = 'some title'
o1.track = 'foo'
o1.year = 'bar'
o2.artist = 'The White Stripes'
o2.title = 'another title'
o2.track = 'foo'
o2.year = 'bar'
r = s.GetDupeGroups([o1, o2])
eq_(len(r), 1)
def test_tag_scan_only_scans_existing_tags():
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'foo'])
o1 = no('foo')
o2 = no('bar')
o1.artist = 'The White Stripes'
o1.foo = 'foo'
o2.artist = 'The White Stripes'
o2.foo = 'bar'
r = s.GetDupeGroups([o1, o2])
eq_(len(r), 1) # Because 'foo' is not scanned, they match
def test_tag_scan_converts_to_str():
s = Scanner()
s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['track'])
o1 = no('foo')
o2 = no('bar')
o1.track = 42
o2.track = 42
try:
r = s.GetDupeGroups([o1, o2]) r = s.GetDupeGroups([o1, o2])
self.assertEqual(1, len(r)) except TypeError:
raise AssertionError()
eq_(len(r), 1)
def test_tag_scan_only_scans_existing_tags(self): def test_tag_scan_non_ascii():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_TAG s.scan_type = SCAN_TYPE_TAG
s.scanned_tags = set(['artist', 'foo']) s.scanned_tags = set(['title'])
o1 = no('foo') o1 = no('foo')
o2 = no('bar') o2 = no('bar')
o1.artist = 'The White Stripes' o1.title = u'foobar\u00e9'
o1.foo = 'foo' o2.title = u'foobar\u00e9'
o2.artist = 'The White Stripes' try:
o2.foo = 'bar'
r = s.GetDupeGroups([o1, o2]) r = s.GetDupeGroups([o1, o2])
self.assertEqual(1, len(r)) # Because 'foo' is not scanned, they match except UnicodeEncodeError:
raise AssertionError()
eq_(len(r), 1)
def test_tag_scan_converts_to_str(self): def test_audio_content_scan():
s = Scanner() s = Scanner()
s.scan_type = SCAN_TYPE_TAG s.scan_type = SCAN_TYPE_CONTENT_AUDIO
s.scanned_tags = set(['track']) f = [no('foo'), no('bar'), no('bleh')]
o1 = no('foo') f[0].md5 = 'foo'
o2 = no('bar') f[1].md5 = 'bar'
o1.track = 42 f[2].md5 = 'bleh'
o2.track = 42 f[0].md5partial = 'foo'
try: f[1].md5partial = 'foo'
r = s.GetDupeGroups([o1, o2]) f[2].md5partial = 'bleh'
except TypeError: f[0].audiosize = 1
self.fail() f[1].audiosize = 1
self.assertEqual(1, len(r)) f[2].audiosize = 1
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_tag_scan_non_ascii(self): def test_audio_content_scan_compare_sizes_first():
s = Scanner() class MyFile(no):
s.scan_type = SCAN_TYPE_TAG @property
s.scanned_tags = set(['title']) def md5partial(file):
o1 = no('foo') raise AssertionError()
o2 = no('bar')
o1.title = u'foobar\u00e9'
o2.title = u'foobar\u00e9'
try:
r = s.GetDupeGroups([o1, o2])
except UnicodeEncodeError:
self.fail()
self.assertEqual(1, len(r))
def test_audio_content_scan(self): s = Scanner()
s = Scanner() s.scan_type = SCAN_TYPE_CONTENT_AUDIO
s.scan_type = SCAN_TYPE_CONTENT_AUDIO f = [MyFile('foo'), MyFile('bar')]
f = [no('foo'),no('bar'),no('bleh')] f[0].audiosize = 1
f[0].md5 = 'foo' f[1].audiosize = 2
f[1].md5 = 'bar' eq_(len(s.GetDupeGroups(f)), 0)
f[2].md5 = 'bleh'
f[0].md5partial = 'foo'
f[1].md5partial = 'foo'
f[2].md5partial = 'bleh'
f[0].audiosize = 1
f[1].audiosize = 1
f[2].audiosize = 1
r = s.GetDupeGroups(f)
self.assertEqual(1,len(r))
self.assertEqual(2,len(r[0]))
def test_audio_content_scan_compare_sizes_first(self): def test_ignore_list():
class MyFile(no): s = Scanner()
def get_md5(file): f1 = no('foobar')
self.fail() f2 = no('foobar')
md5partial = property(get_md5) f3 = no('foobar')
f1.path = Path('dir1/foobar')
f2.path = Path('dir2/foobar')
f3.path = Path('dir3/foobar')
s.ignore_list.Ignore(str(f1.path),str(f2.path))
s.ignore_list.Ignore(str(f1.path),str(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
eq_(len(r), 1)
g = r[0]
eq_(len(g.dupes), 1)
assert f1 not in g
assert f2 in g
assert f3 in g
# Ignored matches are not counted as discarded
eq_(s.discarded_file_count, 0)
s = Scanner() def test_ignore_list_checks_for_unicode():
s.scan_type = SCAN_TYPE_CONTENT_AUDIO #scanner was calling path_str for ignore list checks. Since the Path changes, it must
f = [MyFile('foo'),MyFile('bar')] #be unicode(path)
f[0].audiosize = 1 s = Scanner()
f[1].audiosize = 2 f1 = no('foobar')
self.assertEqual(0,len(s.GetDupeGroups(f))) f2 = no('foobar')
f3 = no('foobar')
f1.path = Path(u'foo1\u00e9')
f2.path = Path(u'foo2\u00e9')
f3.path = Path(u'foo3\u00e9')
s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
eq_(len(r), 1)
g = r[0]
eq_(len(g.dupes), 1)
assert f1 not in g
assert f2 in g
assert f3 in g
def test_ignore_list(self): def test_custom_match_factory():
s = Scanner() class MatchFactory(object):
f1 = no('foobar') def getmatches(self, objects, j=None):
f2 = no('foobar') return [Match(objects[0], objects[1], 420)]
f3 = no('foobar')
f1.path = Path('dir1/foobar')
f2.path = Path('dir2/foobar')
f3.path = Path('dir3/foobar')
s.ignore_list.Ignore(str(f1.path),str(f2.path))
s.ignore_list.Ignore(str(f1.path),str(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
self.assertEqual(1,len(r))
g = r[0]
self.assertEqual(1,len(g.dupes))
self.assert_(f1 not in g)
self.assert_(f2 in g)
self.assert_(f3 in g)
# Ignored matches are not counted as discarded
self.assertEqual(s.discarded_file_count, 0)
def test_ignore_list_checks_for_unicode(self):
#scanner was calling path_str for ignore list checks. Since the Path changes, it must
#be unicode(path)
s = Scanner()
f1 = no('foobar')
f2 = no('foobar')
f3 = no('foobar')
f1.path = Path(u'foo1\u00e9')
f2.path = Path(u'foo2\u00e9')
f3.path = Path(u'foo3\u00e9')
s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
r = s.GetDupeGroups([f1,f2,f3])
self.assertEqual(1,len(r))
g = r[0]
self.assertEqual(1,len(g.dupes))
self.assert_(f1 not in g)
self.assert_(f2 in g)
self.assert_(f3 in g)
def test_custom_match_factory(self):
class MatchFactory(object):
def getmatches(self,objects,j=None):
return [Match(objects[0], objects[1], 420)]
s = Scanner() s = Scanner()
s.match_factory = MatchFactory() s.match_factory = MatchFactory()
o1,o2 = no('foo'),no('bar') o1, o2 = no('foo'), no('bar')
groups = s.GetDupeGroups([o1,o2]) groups = s.GetDupeGroups([o1, o2])
self.assertEqual(1,len(groups)) eq_(len(groups), 1)
g = groups[0] g = groups[0]
self.assertEqual(2,len(g)) eq_(len(g), 2)
g.switch_ref(o1) g.switch_ref(o1)
m = g.get_match_of(o2) m = g.get_match_of(o2)
self.assertEqual((o1,o2,420),m) eq_(m, (o1, o2, 420))
def test_file_evaluates_to_false(self): def test_file_evaluates_to_false():
# A very wrong way to use any() was added at some point, causing resulting group list # A very wrong way to use any() was added at some point, causing resulting group list
# to be empty. # to be empty.
class FalseNamedObject(NamedObject): class FalseNamedObject(NamedObject):
def __nonzero__(self): def __nonzero__(self):
return False return False
s = Scanner() s = Scanner()
f1 = FalseNamedObject('foobar') f1 = FalseNamedObject('foobar')
f2 = FalseNamedObject('foobar') f2 = FalseNamedObject('foobar')
r = s.GetDupeGroups([f1,f2]) r = s.GetDupeGroups([f1, f2])
self.assertEqual(1,len(r)) eq_(len(r), 1)
def test_size_threshold(self): def test_size_threshold():
# Only file equal or higher than the size_threshold in size are scanned # Only file equal or higher than the size_threshold in size are scanned
s = Scanner() s = Scanner()
f1 = no('foo', 1) f1 = no('foo', 1)
f2 = no('foo', 2) f2 = no('foo', 2)
f3 = no('foo', 3) f3 = no('foo', 3)
s.size_threshold = 2 s.size_threshold = 2
groups = s.GetDupeGroups([f1,f2,f3]) groups = s.GetDupeGroups([f1,f2,f3])
self.assertEqual(len(groups), 1) eq_(len(groups), 1)
[group] = groups [group] = groups
self.assertEqual(len(group), 2) eq_(len(group), 2)
self.assertTrue(f1 not in group) assert f1 not in group
self.assertTrue(f2 in group) assert f2 in group
self.assertTrue(f3 in group) assert f3 in group
def test_tie_breaker_path_deepness(self): def test_tie_breaker_path_deepness():
# If there is a tie in prioritization, path deepness is used as a tie breaker # If there is a tie in prioritization, path deepness is used as a tie breaker
s = Scanner() s = Scanner()
o1, o2 = no('foo'), no('foo') o1, o2 = no('foo'), no('foo')
o1.path = Path('foo') o1.path = Path('foo')
o2.path = Path('foo/bar') o2.path = Path('foo/bar')
[group] = s.GetDupeGroups([o1, o2]) [group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2) assert group.ref is o2
def test_tie_breaker_copy(self): def test_tie_breaker_copy():
# if copy is in the words used (even if it has a deeper path), it becomes a dupe # if copy is in the words used (even if it has a deeper path), it becomes a dupe
s = Scanner() s = Scanner()
o1, o2 = no('foo bar Copy'), no('foo bar') o1, o2 = no('foo bar Copy'), no('foo bar')
o1.path = Path('deeper/path') o1.path = Path('deeper/path')
o2.path = Path('foo') o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2]) [group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2) assert group.ref is o2
def test_tie_breaker_same_name_plus_digit(self): def test_tie_breaker_same_name_plus_digit():
# if ref has the same words as dupe, but has just one extra word which is a digit, it # if ref has the same words as dupe, but has just one extra word which is a digit, it
# becomes a dupe # becomes a dupe
s = Scanner() s = Scanner()
o1, o2 = no('foo bar 42'), no('foo bar') o1, o2 = no('foo bar 42'), no('foo bar')
o1.path = Path('deeper/path') o1.path = Path('deeper/path')
o2.path = Path('foo') o2.path = Path('foo')
[group] = s.GetDupeGroups([o1, o2]) [group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2) assert group.ref is o2
def test_partial_group_match(self): def test_partial_group_match():
# Count the number of discarded matches (when a file doesn't match all other dupes of the # Count the number of discarded matches (when a file doesn't match all other dupes of the
# group) in Scanner.discarded_file_count # group) in Scanner.discarded_file_count
s = Scanner() s = Scanner()
o1, o2, o3 = no('a b'), no('a'), no('b') o1, o2, o3 = no('a b'), no('a'), no('b')
s.min_match_percentage = 50 s.min_match_percentage = 50
[group] = s.GetDupeGroups([o1, o2, o3]) [group] = s.GetDupeGroups([o1, o2, o3])
self.assertEqual(len(group), 2) eq_(len(group), 2)
self.assertTrue(o1 in group) assert o1 in group
self.assertTrue(o2 in group) assert o2 in group
self.assertTrue(o3 not in group) assert o3 not in group
self.assertEqual(s.discarded_file_count, 1) eq_(s.discarded_file_count, 1)
class TCScannerME(TestCase): #--- Scanner ME
def test_priorize(self): def test_priorize_me():
# in ScannerME, bitrate goes first (right after is_ref) in priorization # in ScannerME, bitrate goes first (right after is_ref) in priorization
s = ScannerME() s = ScannerME()
o1, o2 = no('foo'), no('foo') o1, o2 = no('foo'), no('foo')
o1.bitrate = 1 o1.bitrate = 1
o2.bitrate = 2 o2.bitrate = 2
[group] = s.GetDupeGroups([o1, o2]) [group] = s.GetDupeGroups([o1, o2])
self.assertTrue(group.ref is o2) assert group.ref is o2